In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sys
import os
import time

import pyspark

from sklearn.metrics import roc_auc_score
In [2]:
ls
'AML Class code'/                               grid_search_results_xgb.csv
 best_nn_model.keras                            Modelling.ipynb
 best_xgb_model.json                            my_model.h5
'Credit Risk Project.docx'*                    'Older Modelling File.ipynb'
 DataPrepFull_Modelling-Approach2+Strat.ipynb   Read.ipynb
 DataPrepFull_Modelling.ipynb                   sample_submission.csv*
 dev.csv/                                       test_data.csv*
 Exploration.ipynb                              train_data.csv*
 feature_importance_xgb_m1.xlsx                 train_labels.csv*
 feature_importance_xgb_m2.xlsx                 Untitled1.ipynb
'Final_DAta Prep Steps.ipynb'                   Untitled2.ipynb
 grid_search_results.csv                        Untitled.ipynb
 grid_search_results_nn.csv                     X_train.csv
In [3]:
df = pd.read_csv('dev.csv/part-00000-ee748d50-0c69-46e0-bdfd-03dac1fb4272-c000.csv')
In [4]:
# set options to see the dataframe entirely
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

pd.set_option('display.max_colwidth', None)
In [5]:
df.head(5)
Out[5]:
customer_ID target S_2 P_2 D_39 B_1 B_2 R_1 S_3 D_41 B_3 D_42 D_43 D_44 B_4 D_45 B_5 R_2 D_46 D_47 D_48 D_49 B_6 B_7 B_8 D_50 D_51 B_9 R_3 D_52 P_3 B_10 D_53 S_5 B_11 S_6 D_54 R_4 S_7 B_12 S_8 D_55 D_56 B_13 R_5 D_58 S_9 B_14 D_59 D_60 D_61 B_15 S_11 D_62 D_63 D_64 D_65 B_16 B_17 B_18 B_19 D_66 B_20 D_68 S_12 R_6 S_13 B_21 D_69 B_22 D_70 D_71 D_72 S_15 B_23 D_73 P_4 D_74 D_75 D_76 B_24 R_7 D_77 B_25 B_26 D_78 D_79 R_8 R_9 S_16 D_80 R_10 R_11 B_27 D_81 D_82 S_17 R_12 B_28 R_13 D_83 R_14 R_15 D_84 R_16 B_29 B_30 S_18 D_86 D_87 R_17 R_18 D_88 B_31 S_19 R_19 B_32 S_20 R_20 R_21 B_33 D_89 R_22 R_23 D_91 D_92 D_93 D_94 R_24 R_25 D_96 S_22 S_23 S_24 S_25 S_26 D_102 D_103 D_104 D_105 D_106 D_107 B_36 B_37 R_26 R_27 B_38 D_108 D_109 D_110 D_111 B_39 D_112 B_40 S_27 D_113 D_114 D_115 D_116 D_117 D_118 D_119 D_120 D_121 D_122 D_123 D_124 D_125 D_126 D_127 D_128 D_129 B_41 B_42 D_130 D_131 D_132 D_133 R_28 D_134 D_135 D_136 D_137 D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145
0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a 0 2017-03-09 0.938469 0.001733 0.008724 1.006838 0.009228 0.124035 0.008771 0.004709 NaN NaN 0.000630 0.080986 0.708906 0.170600 0.006204 0.358587 0.525351 0.255736 NaN 0.063902 0.059416 0.006466 0.148698 1.335856 0.008207 0.001423 0.207334 0.736463 0.096219 NaN 0.023381 0.002768 0.008322 1.001519 0.008298 0.161345 0.148266 0.922998 0.354596 0.152025 0.118075 0.001882 0.158612 0.065728 0.018385 0.063646 0.199617 0.308233 0.016361 0.401619 0.091071 CR O 0.007126 0.007665 NaN 0.652984 0.008520 NaN 0.004730 6.0 0.272008 0.008363 0.515222 0.002644 0.009013 0.004808 0.008342 0.119403 0.004802 0.108271 0.050882 NaN 0.007554 0.080422 0.069067 NaN 0.004327 0.007562 NaN 0.007729 0.000272 0.001576 0.004239 0.001434 NaN 0.002271 0.004061 0.007121 0.002456 0.002310 0.003532 0.506612 0.008033 1.009825 0.084683 0.003820 0.007043 0.000438 0.006452 0.000830 0.005055 NaN 0.0 0.005720 0.007084 NaN 0.000198 0.008907 NaN 1 0.002537 0.005177 0.006626 0.009705 0.007782 0.002450 1.001101 0.002665 0.007479 0.006893 1.503673 1.006133 0.003569 0.008871 0.003950 0.003647 0.004950 0.894090 0.135561 0.911191 0.974539 0.001243 0.766688 1.008691 1.004587 0.893734 NaN 0.670041 0.009968 0.004572 NaN 1.008949 2.0 NaN 0.004326 NaN NaN NaN 1.007336 0.210060 0.676922 0.007871 1.0 0.238250 0.0 4.0 0.232120 0.236266 0.0 0.702280 0.434345 0.003057 0.686516 0.008740 1.0 1.003319 1.007819 1.000080 0.006805 NaN 0.002052 0.005972 NaN 0.004345 0.001535 NaN NaN NaN NaN NaN 0.002427 0.003706 0.003818 NaN 0.000569 0.000610 0.002674
1 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a 0 2017-04-07 0.936665 0.005775 0.004923 1.000653 0.006151 0.126750 0.000798 0.002714 NaN NaN 0.002526 0.069419 0.712795 0.113239 0.006206 0.353630 0.521311 0.223329 NaN 0.065261 0.057744 0.001614 0.149723 1.339794 0.008373 0.001984 0.202778 0.720886 0.099804 NaN 0.030599 0.002749 0.002482 1.009033 0.005136 0.140951 0.143530 0.919414 0.326757 0.156201 0.118737 0.001610 0.148459 0.093935 0.013035 0.065501 0.151387 0.265026 0.017688 0.406326 0.086805 CR O 0.002413 0.007148 NaN 0.647093 0.002238 NaN 0.003879 6.0 0.188970 0.004030 0.509048 0.004193 0.007842 0.001283 0.006524 0.140611 0.000094 0.101018 0.040469 NaN 0.004832 0.081413 0.074166 NaN 0.004203 0.005304 NaN 0.001864 0.000979 0.009896 0.007597 0.000509 NaN 0.009810 0.000127 0.005966 0.000395 0.001327 0.007773 0.500855 0.000760 1.009461 0.081843 0.000347 0.007789 0.004311 0.002332 0.009469 0.003753 NaN 0.0 0.007584 0.006677 NaN 0.001142 0.005907 NaN 1 0.008427 0.008979 0.001854 0.009924 0.005987 0.002247 1.006779 0.002508 0.006827 0.002837 1.503577 1.005791 0.000571 0.000391 0.008351 0.008850 0.003180 0.902135 0.136333 0.919876 0.975624 0.004561 0.786007 1.000084 1.004118 0.906841 NaN 0.668647 0.003921 0.004654 NaN 1.003205 2.0 NaN 0.008707 NaN NaN NaN 1.007653 0.184093 0.822281 0.003444 1.0 0.247217 0.0 4.0 0.243532 0.241885 0.0 0.707017 0.430501 0.001306 0.686414 0.000755 1.0 1.008394 1.004333 1.008344 0.004407 NaN 0.001034 0.004838 NaN 0.007495 0.004931 NaN NaN NaN NaN NaN 0.003954 0.003167 0.005032 NaN 0.009576 0.005492 0.009217
2 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a 0 2017-05-28 0.954180 0.091505 0.021655 1.009672 0.006815 0.123977 0.007598 0.009423 NaN NaN 0.007605 0.068839 0.720884 0.060492 0.003259 0.334650 0.524568 0.189424 NaN 0.066982 0.056647 0.005126 0.151955 1.337179 0.009355 0.007426 0.206629 0.738044 0.134073 NaN 0.048367 0.010077 0.000530 1.009184 0.006961 0.112229 0.137014 1.001977 0.304124 0.153795 0.114534 0.006328 0.139504 0.084757 0.056653 0.070607 0.305883 0.212165 0.063955 0.406768 0.094001 CR O 0.001878 0.003636 NaN 0.645819 0.000408 NaN 0.004578 6.0 0.495308 0.006838 0.679257 0.001337 0.006025 0.009393 0.002615 0.075868 0.007152 0.103239 0.047454 NaN 0.006561 0.078891 0.076510 NaN 0.001782 0.001422 NaN 0.005419 0.006149 0.009629 0.003094 0.008295 NaN 0.009362 0.000954 0.005447 0.007345 0.007624 0.008811 0.504606 0.004056 1.004291 0.081954 0.002709 0.004093 0.007139 0.008358 0.002325 0.007381 NaN 0.0 0.005901 0.001185 NaN 0.008013 0.008882 NaN 1 0.007327 0.002016 0.008686 0.008446 0.007291 0.007794 1.001014 0.009634 0.009820 0.005080 1.503359 1.005801 0.007425 0.009234 0.002471 0.009769 0.005433 0.939654 0.134938 0.958699 0.974067 0.011736 0.806840 1.003014 1.009285 0.928719 NaN 0.670901 0.001264 0.019176 NaN 1.000754 2.0 NaN 0.004092 NaN NaN NaN 1.004312 0.154837 0.853498 0.003269 1.0 0.239867 0.0 4.0 0.240768 0.239710 0.0 0.704843 0.434409 0.003954 0.690101 0.009617 1.0 1.009307 1.007831 1.006878 0.003221 NaN 0.005681 0.005497 NaN 0.009227 0.009123 NaN NaN NaN NaN NaN 0.003269 0.007329 0.000427 NaN 0.003429 0.006986 0.002603
3 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a 0 2017-06-13 0.960384 0.002455 0.013683 1.002700 0.001373 0.117169 0.000685 0.005531 NaN NaN 0.006406 0.055630 0.723997 0.166782 0.009918 0.323271 0.530929 0.135586 NaN 0.083720 0.049253 0.001418 0.151219 1.339909 0.006782 0.003515 0.208214 0.741813 0.134437 NaN 0.030063 0.009667 0.000783 1.007456 0.008706 0.102838 0.129017 0.704016 0.275055 0.155772 0.120740 0.004980 0.138100 0.048382 0.012498 0.065926 0.273553 0.204300 0.022732 0.405175 0.094854 CR O 0.005899 0.005896 NaN 0.654358 0.005897 NaN 0.005207 6.0 0.508670 0.008183 0.515282 0.008716 0.005271 0.004554 0.002052 0.150209 0.005364 0.206394 0.031705 NaN 0.009559 0.077490 0.071547 NaN 0.005595 0.006363 NaN 0.000646 0.009193 0.008568 0.003895 0.005153 NaN 0.004876 0.005665 0.001888 0.004961 0.000034 0.004652 0.508998 0.006969 1.004728 0.060634 0.009982 0.008817 0.008690 0.007364 0.005924 0.008802 NaN 0.0 0.002520 0.003324 NaN 0.009455 0.008348 NaN 1 0.007053 0.003909 0.002478 0.006614 0.009977 0.007686 1.002775 0.007791 0.000458 0.007320 1.503701 1.007036 0.000664 0.003200 0.008507 0.004858 0.000063 0.913205 0.140058 0.926341 0.975499 0.007571 0.808214 1.001517 1.004514 0.935383 NaN 0.672620 0.002729 0.011720 NaN 1.005338 2.0 NaN 0.009703 NaN NaN NaN 1.002538 0.153939 0.844667 0.000053 1.0 0.240910 0.0 4.0 0.239400 0.240727 0.0 0.711546 0.436903 0.005135 0.687779 0.004649 1.0 1.001671 1.003460 1.007573 0.007703 NaN 0.007108 0.008261 NaN 0.007206 0.002409 NaN NaN NaN NaN NaN 0.006117 0.004516 0.003200 NaN 0.008419 0.006527 0.009600
4 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a 0 2017-07-16 0.947248 0.002483 0.015193 1.000727 0.007605 0.117325 0.004653 0.009312 NaN NaN 0.007731 0.038862 0.720619 0.143630 0.006667 0.231009 0.529305 NaN NaN 0.075900 0.048918 0.001199 0.154026 1.341735 0.000519 0.001362 0.205468 0.691986 0.121518 NaN 0.054221 0.009484 0.006698 1.003738 0.003846 0.094311 0.129539 0.917133 0.231110 0.154914 0.095178 0.001653 0.126443 0.039259 0.027897 0.063697 0.233103 0.175655 0.031171 0.487460 0.093915 CR O 0.009479 0.001714 NaN 0.650112 0.007773 NaN 0.005851 6.0 0.216507 0.008605 0.507712 0.006821 0.000152 0.000104 0.001419 0.096441 0.007972 0.106020 0.032733 NaN 0.008156 0.076561 0.074432 NaN 0.004933 0.004831 NaN 0.001833 0.005738 0.003289 0.002608 0.007338 NaN 0.007447 0.004465 0.006111 0.002246 0.002109 0.001141 0.506213 0.001770 1.000904 0.062492 0.005860 0.001845 0.007816 0.002470 0.005516 0.007166 NaN 0.0 0.000155 0.001504 NaN 0.002019 0.002678 NaN 1 0.007728 0.003432 0.002199 0.005511 0.004105 0.009656 1.006536 0.005158 0.003341 0.000264 1.509905 1.002915 0.003079 0.003845 0.007190 0.002983 0.000535 0.921026 0.131620 0.933479 0.978027 0.018200 0.822281 1.006125 1.005735 0.953363 NaN 0.673869 0.009998 0.017598 NaN 1.003175 2.0 NaN 0.009120 NaN NaN NaN 1.000130 0.120717 0.811199 0.008724 1.0 0.247939 0.0 4.0 0.244199 0.242325 0.0 0.705343 0.437433 0.002849 0.688774 0.000097 1.0 1.009886 1.005053 1.008132 0.009823 NaN 0.009680 0.004848 NaN 0.006312 0.004462 NaN NaN NaN NaN NaN 0.003671 0.004946 0.008889 NaN 0.001670 0.008126 0.009827
In [6]:
df.shape
Out[6]:
(1103628, 191)
In [7]:
df = df.sample(frac=0.7, random_state=42) 
In [8]:
df.shape
Out[8]:
(772540, 191)
In [9]:
df['customer_ID'].nunique()
Out[9]:
91149
In [10]:
# Re Organize the dataframe.
fixed_columns = ['customer_ID', 'S_2', 'target']    # since s_2 is the date column
other_columns = sorted([col for col in df.columns if col not in fixed_columns])

new_column_order = fixed_columns + other_columns
df = df[new_column_order]
In [11]:
from collections import Counter

initial_chars = [col[0] for col in df.columns if col[0].isalpha()]
initial_char_counts = Counter(initial_chars)

initial_char_counts_df = pd.DataFrame(list(initial_char_counts.items()), columns=['Initial_Char', 'Count'])

mapping = {
    'c': 'Customer ID',
    's': 'Spend Variables',
    't': 'Target',
    'B': 'Balance Variables',
    'D': 'Delinquency Variables',
    'P': 'Payment Variables',
    'R': 'Risk Variables',
}

initial_char_counts_df['Description'] = initial_char_counts_df['Initial_Char'].map(lambda x: mapping.get(x, 'Other'))
initial_char_counts_df
Out[11]:
Initial_Char Count Description
0 c 1 Customer ID
1 S 22 Other
2 t 1 Target
3 B 40 Balance Variables
4 D 96 Delinquency Variables
5 P 3 Payment Variables
6 R 28 Risk Variables
In [12]:
import plotly.graph_objects as go

# Creating the plot
fig = go.Figure()

fig.add_trace(go.Bar(x=initial_char_counts_df['Description'], 
                     y=initial_char_counts_df['Count'], 
                     marker_color='rgb(55, 83, 109)'))

fig.update_layout(title='Count of Columns by Description',
                  xaxis=dict(title='Description'),
                  yaxis=dict(title='Count'),
                 height=600)

fig.show()

Customer History Buckets¶

In [13]:
customer_counts = df['customer_ID'].value_counts()

counts_df = pd.DataFrame(customer_counts).reset_index()
counts_df.columns = ['c_id', 'counts']

# grouping by counts to calculate number of customers
count_summary = counts_df.groupby('counts')['c_id'].nunique().reset_index()

# renaming the columns
count_summary.columns = ['count_bucket', 'num_customers']

total_customers = count_summary['num_customers'].sum()

# Calculate the percentage of total for num_customers
count_summary['percent_of_total'] = (count_summary['num_customers'] / total_customers) * 100

# Show the DataFrame
count_summary
Out[13]:
count_bucket num_customers percent_of_total
0 1 1574 1.726843
1 2 1602 1.757562
2 3 1551 1.701609
3 4 1746 1.915545
4 5 2694 2.955600
5 6 5040 5.529408
6 7 9596 10.527817
7 8 15315 16.802159
8 9 19019 20.865835
9 10 17294 18.973329
10 11 10770 11.815818
11 12 4182 4.588092
12 13 766 0.840382

Dropping Customers with less than 3 months of data¶

In [14]:
customer_counts = df['customer_ID'].value_counts()

# Filter customer_counts to get IDs of customers who appear 3 times or more
customers_to_keep = customer_counts[customer_counts >= 3].index

# Filter the original DataFrame to keep only rows with these customer IDs
filtered_df = df[df['customer_ID'].isin(customers_to_keep)]



customer_counts = filtered_df['customer_ID'].value_counts()

counts_df = pd.DataFrame(customer_counts).reset_index()
counts_df.columns = ['c_id', 'counts']

# grouping by counts to calculate number of customers
count_summary = counts_df.groupby('counts')['c_id'].nunique().reset_index()

# renaming the columns
count_summary.columns = ['count_bucket', 'num_customers']

# show dataframe
count_summary
Out[14]:
count_bucket num_customers
0 3 1551
1 4 1746
2 5 2694
3 6 5040
4 7 9596
5 8 15315
6 9 19019
7 10 17294
8 11 10770
9 12 4182
10 13 766
In [15]:
df = filtered_df

Convert S_2 to datetime¶

In [16]:
df['S_2'] = pd.to_datetime(df['S_2'])
print(df['S_2'].dtype)
datetime64[ns]

Missing Value Analysis¶

In [17]:
# See if there any null values
v1 = df.isnull().sum()
v2 = ((df.isnull().sum()/len(df))*100).round(5)

[f'{percent:.2f}%' for percent in v2]

null_df = pd.DataFrame({'Null Value Count':v1,
                       'Null Value Percent':[f'{percent:.2f}%' for percent in v2]})
null_df.sort_values(by='Null Value Count', ascending=False)
Out[17]:
Null Value Count Null Value Percent
D_87 767174 99.92%
D_88 766977 99.90%
D_108 763651 99.46%
D_110 763449 99.44%
D_111 763449 99.44%
B_39 763137 99.40%
D_73 759973 98.99%
B_42 757425 98.65%
D_136 739740 96.35%
D_138 739740 96.35%
D_135 739740 96.35%
D_134 739740 96.35%
D_137 739740 96.35%
R_9 724561 94.37%
B_29 715278 93.16%
D_106 692648 90.22%
D_132 692482 90.19%
D_49 692046 90.14%
R_26 682101 88.84%
D_76 681881 88.81%
D_66 681104 88.71%
D_42 660687 86.05%
D_142 637449 83.03%
D_53 567437 73.91%
D_82 564019 73.46%
B_17 435034 56.66%
D_50 434760 56.63%
D_105 420609 54.78%
D_56 414084 53.93%
S_9 407525 53.08%
D_77 348458 45.39%
D_43 228077 29.71%
S_27 193536 25.21%
D_46 165914 21.61%
S_3 141795 18.47%
S_7 141795 18.47%
D_62 104847 13.66%
D_48 99703 12.99%
D_61 82682 10.77%
P_3 38415 5.00%
D_44 37758 4.92%
D_78 37758 4.92%
D_64 27859 3.63%
D_68 27737 3.61%
D_55 24939 3.25%
D_69 24586 3.20%
D_83 24586 3.20%
D_116 22082 2.88%
D_125 22082 2.88%
D_124 22082 2.88%
D_123 22082 2.88%
D_122 22082 2.88%
D_121 22082 2.88%
D_120 22082 2.88%
D_119 22082 2.88%
D_118 22082 2.88%
D_117 22082 2.88%
D_115 22082 2.88%
D_114 22082 2.88%
D_113 22082 2.88%
D_91 18905 2.46%
D_126 16140 2.10%
R_27 14530 1.89%
D_130 13288 1.73%
D_143 13288 1.73%
D_145 13288 1.73%
D_104 13288 1.73%
D_103 13288 1.73%
D_141 13288 1.73%
D_131 13288 1.73%
D_139 13288 1.73%
D_129 13288 1.73%
D_128 13288 1.73%
D_107 13288 1.73%
D_59 13107 1.71%
D_70 11896 1.55%
D_79 9786 1.27%
P_2 6123 0.80%
B_13 6089 0.79%
D_133 5858 0.76%
D_144 5575 0.73%
D_102 5569 0.73%
D_140 5566 0.72%
D_52 3875 0.50%
D_89 3875 0.50%
D_84 3875 0.50%
D_81 3271 0.43%
D_72 3000 0.39%
D_74 2764 0.36%
D_80 2764 0.36%
S_22 2588 0.34%
S_24 2520 0.33%
B_8 2509 0.33%
S_25 1935 0.25%
B_25 997 0.13%
B_15 997 0.13%
D_112 363 0.05%
B_20 269 0.04%
B_19 269 0.04%
B_2 269 0.04%
B_16 269 0.04%
D_45 269 0.04%
D_54 269 0.04%
B_22 269 0.04%
B_38 269 0.04%
B_26 269 0.04%
B_27 269 0.04%
D_41 269 0.04%
B_3 269 0.04%
B_30 269 0.04%
B_33 269 0.04%
D_109 197 0.03%
B_41 102 0.01%
S_26 94 0.01%
S_23 62 0.01%
R_20 11 0.00%
B_37 8 0.00%
R_12 8 0.00%
B_40 6 0.00%
B_6 6 0.00%
B_11 0 0.00%
R_4 0 0.00%
S_12 0 0.00%
R_21 0 0.00%
R_22 0 0.00%
R_23 0 0.00%
S_13 0 0.00%
R_24 0 0.00%
R_25 0 0.00%
B_12 0 0.00%
S_8 0 0.00%
target 0 0.00%
S_15 0 0.00%
R_28 0 0.00%
R_3 0 0.00%
R_5 0 0.00%
R_8 0 0.00%
S_6 0 0.00%
S_11 0 0.00%
S_5 0 0.00%
R_6 0 0.00%
R_7 0 0.00%
B_1 0 0.00%
B_10 0 0.00%
R_19 0 0.00%
S_20 0 0.00%
S_19 0 0.00%
S_18 0 0.00%
S_17 0 0.00%
S_16 0 0.00%
R_2 0 0.00%
B_5 0 0.00%
R_18 0 0.00%
D_39 0 0.00%
D_58 0 0.00%
B_21 0 0.00%
D_51 0 0.00%
B_23 0 0.00%
B_24 0 0.00%
S_2 0 0.00%
D_47 0 0.00%
B_28 0 0.00%
D_63 0 0.00%
B_31 0 0.00%
B_32 0 0.00%
B_36 0 0.00%
D_127 0 0.00%
B_4 0 0.00%
B_9 0 0.00%
B_7 0 0.00%
D_60 0 0.00%
D_65 0 0.00%
R_17 0 0.00%
P_4 0 0.00%
R_16 0 0.00%
R_15 0 0.00%
R_14 0 0.00%
R_13 0 0.00%
R_11 0 0.00%
R_10 0 0.00%
R_1 0 0.00%
D_96 0 0.00%
D_71 0 0.00%
D_94 0 0.00%
D_93 0 0.00%
D_92 0 0.00%
B_14 0 0.00%
D_86 0 0.00%
B_18 0 0.00%
D_75 0 0.00%
customer_ID 0 0.00%
In [18]:
### Dropping Columns with more than 20% nulls
# Dropping Columns with more than 20% Nulls
# calculate % if missing
# v2 = df.isnull().mean()*100

# # filter out columns with more than 20% nulls
# cols_to_drop = v2[v2>20].index

# # droppping these columns from the dataframe
# df.drop(columns=cols_to_drop, inplace=True)

# # recalculate nulls
# v1 = df.isnull().sum()
# v2 = ((df.isnull().sum()/len(df))*100).round(5)

# [f'{percent:.2f}%' for percent in v2]

# null_df2 = pd.DataFrame({'Null Value Count':v1,
#                        'Null Value Percent':[f'{percent:.2f}%' for percent in v2]})
# null_df2.sort_values(by='Null Value Count', ascending=False)

Recheck for Missing Values¶

In [20]:
# # See if there any null values
# v1 = df.isnull().sum()
# v2 = ((df.isnull().sum()/len(df))*100).round(5)

# [f'{percent:.2f}%' for percent in v2]

# null_df = pd.DataFrame({'Null Value Count':v1,
#                        'Null Value Percent':[f'{percent:.2f}%' for percent in v2]})
# null_df.sort_values(by='Null Value Count', ascending=False)
In [ ]:
 
In [ ]:
 

Imputing Categorical Columns with Mode and Numerical with Mean¶

In [21]:
categorical_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 
                       'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

for column in categorical_columns:
    df[column] = df[column].fillna(df[column].mode()[0])
    
numerical_columns = df.select_dtypes(include=[np.number]).columns

for column in numerical_columns:
    df[column] = df[column].fillna(df[column].median())
In [24]:
for col in categorical_columns:
    print(df[col].value_counts())
    print('\n')
B_30
0.0    653606
1.0    106573
2.0      7583
Name: count, dtype: int64


B_38
2.0    272273
3.0    174347
1.0    159901
5.0     62089
4.0     41485
7.0     35239
6.0     22428
Name: count, dtype: int64


D_114
1.0    482721
0.0    285041
Name: count, dtype: int64


D_116
0.0    766895
1.0       867
Name: count, dtype: int64


D_117
-1.0    225881
 3.0    161239
 4.0    158614
 2.0     93755
 5.0     63263
 6.0     48314
 1.0     16696
Name: count, dtype: int64


D_120
0.0    680865
1.0     86897
Name: count, dtype: int64


D_126
 1.0    608319
 0.0    122394
-1.0     37049
Name: count, dtype: int64


D_63
CO    572746
CR    128750
CL     60596
XZ      3388
XM      1287
XL       995
Name: count, dtype: int64


D_64
O     431797
U     213302
R     117473
-1      5190
Name: count, dtype: int64


D_66
1.0    766935
0.0       827
Name: count, dtype: int64


D_68
6.0    413529
5.0    168788
4.0     67194
3.0     66739
2.0     30538
1.0     18811
0.0      2163
Name: count, dtype: int64


Categorical Data Encoding¶

In [25]:
columns_to_encode = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']
cat_df_encoded = pd.get_dummies(df, columns=columns_to_encode, prefix=columns_to_encode, dtype=int)
In [26]:
new_columns = [col for col in cat_df_encoded.columns if col not in df.columns]

print(new_columns)
['B_30_0.0', 'B_30_1.0', 'B_30_2.0', 'B_38_1.0', 'B_38_2.0', 'B_38_3.0', 'B_38_4.0', 'B_38_5.0', 'B_38_6.0', 'B_38_7.0', 'D_114_0.0', 'D_114_1.0', 'D_116_0.0', 'D_116_1.0', 'D_117_-1.0', 'D_117_1.0', 'D_117_2.0', 'D_117_3.0', 'D_117_4.0', 'D_117_5.0', 'D_117_6.0', 'D_120_0.0', 'D_120_1.0', 'D_126_-1.0', 'D_126_0.0', 'D_126_1.0', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'D_64_-1', 'D_64_O', 'D_64_R', 'D_64_U', 'D_68_0.0', 'D_68_1.0', 'D_68_2.0', 'D_68_3.0', 'D_68_4.0', 'D_68_5.0', 'D_68_6.0']
In [27]:
df2 = cat_df_encoded
In [28]:
df2.head()
Out[28]:
customer_ID S_2 target B_1 B_10 B_11 B_12 B_13 B_14 B_15 B_16 B_17 B_18 B_19 B_2 B_20 B_21 B_22 B_23 B_24 B_25 B_26 B_27 B_28 B_29 B_3 B_31 B_32 B_33 B_36 B_37 B_39 B_4 B_40 B_41 B_42 B_5 B_6 B_7 B_8 B_9 D_102 D_103 D_104 D_105 D_106 D_107 D_108 D_109 D_110 D_111 D_112 D_113 D_115 D_118 D_119 D_121 D_122 D_123 D_124 D_125 D_127 D_128 D_129 D_130 D_131 D_132 D_133 D_134 D_135 D_136 D_137 D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 D_39 D_41 D_42 D_43 D_44 D_45 D_46 D_47 D_48 D_49 D_50 D_51 D_52 D_53 D_54 D_55 D_56 D_58 D_59 D_60 D_61 D_62 D_65 D_66 D_69 D_70 D_71 D_72 D_73 D_74 D_75 D_76 D_77 D_78 D_79 D_80 D_81 D_82 D_83 D_84 D_86 D_87 D_88 D_89 D_91 D_92 D_93 D_94 D_96 P_2 P_3 P_4 R_1 R_10 R_11 R_12 R_13 R_14 R_15 R_16 R_17 R_18 R_19 R_2 R_20 R_21 R_22 R_23 R_24 R_25 R_26 R_27 R_28 R_3 R_4 R_5 R_6 R_7 R_8 R_9 S_11 S_12 S_13 S_15 S_16 S_17 S_18 S_19 S_20 S_22 S_23 S_24 S_25 S_26 S_27 S_3 S_5 S_6 S_7 S_8 S_9 B_30_0.0 B_30_1.0 B_30_2.0 B_38_1.0 B_38_2.0 B_38_3.0 B_38_4.0 B_38_5.0 B_38_6.0 B_38_7.0 D_114_0.0 D_114_1.0 D_116_0.0 D_116_1.0 D_117_-1.0 D_117_1.0 D_117_2.0 D_117_3.0 D_117_4.0 D_117_5.0 D_117_6.0 D_120_0.0 D_120_1.0 D_126_-1.0 D_126_0.0 D_126_1.0 D_63_CL D_63_CO D_63_CR D_63_XL D_63_XM D_63_XZ D_64_-1 D_64_O D_64_R D_64_U D_68_0.0 D_68_1.0 D_68_2.0 D_68_3.0 D_68_4.0 D_68_5.0 D_68_6.0
1001678 e82426907e83eff6314d6932387e05980260b6a504a859378e40f6148de77e9b 2018-01-30 0 0.092478 0.024581 0.063496 0.006179 0.023176 0.022227 0.009837 0.758565 0.932517 0.287218 0.121374 0.222187 0.118112 0.007755 0.006110 0.273957 0.004579 0.039733 0.000247 0.002718 0.102830 0.005099 0.065400 1 0.009412 0.001723 0.000011 0.088204 0.14597 0.278010 0.310617 0.002795 0.022975 0.012549 0.021231 0.320102 1.000424 0.516240 0.424792 1.008174 0.960303 0.497643 0.136416 0.674171 0.005392 0.001766 0.891639 1.003717 1.007348 0.208180 0.037980 0.042318 0.033467 0.082271 0.294749 0.009710 0.006331 0.001693 0.009782 0.009530 0.004320 0.006810 0.000972 0.16179 0.008678 0.211991 0.005103 0.253976 0.005023 0.006987 0.002402 0.004553 0.002616 0.378180 0.004751 0.002337 0.007682 0.685093 0.006109 0.12183 0.116663 0.504001 0.047147 0.538970 0.080949 0.815861 0.13003 0.109446 0.008428 0.045970 0.013349 1.008537 0.548011 0.149113 0.505297 0.468230 0.092569 0.831179 0.022596 0.008995 1.0 0.005642 0.251570 0.013525 0.337655 0.102926 0.288860 0.334192 0.05855 0.205999 0.007066 0.006606 0.004961 0.005136 0.009484 0.005704 0.004489 0.004797 1.0 0.081778 0.006549 0.004516 0.001358 0.009264 0.001387 0.000945 0.423450 0.696847 0.009930 0.006237 0.004581 0.008294 1.000849 0.002636 0.001108 0.001155 0.000026 0.002033 0.004208 0.005400 0.007020 0.004571 0.001855 0.003639 0.003954 0.008042 0.001306 0.03644 1.009312 0.002486 0.201494 0.007820 0.000448 0.005406 0.007668 0.001542 0.172798 0.446841 0.190555 0.005894 0.801286 0.004133 0.003098 0.006537 0.002140 0.004594 0.949931 0.140650 0.963861 0.971872 0.009365 0.004759 0.415000 0.029318 0.001722 0.345205 0.108934 0.019381 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
698107 a1ece17ade53583a522f0fec8c7e8c79085a24c9377bba1ff4fdce232b4da214 2017-08-30 1 0.069503 0.101049 0.070815 0.626662 0.365552 0.785919 0.922997 1.004107 0.915636 0.134597 0.086449 0.300279 1.007248 0.002313 0.503756 0.063787 0.958598 0.007392 0.009718 0.001148 0.397575 0.001723 0.430102 1 0.002739 0.008025 0.003982 0.063742 0.14597 0.174456 0.055480 0.007250 0.022975 0.001694 0.069223 0.063528 1.004163 0.007677 0.122184 1.004207 0.950343 0.071197 0.136416 0.339161 0.005392 0.001980 0.891639 1.003717 0.006020 0.200886 0.119490 0.114635 0.113965 0.399811 0.292976 0.008145 0.368086 0.002370 0.009576 0.004705 0.001926 0.007697 0.006051 0.16179 0.007495 0.211991 0.005103 0.253976 0.005023 0.006987 0.003982 0.009797 0.002361 0.378180 0.005402 0.004501 0.006795 0.800477 0.641640 0.12183 0.096170 0.125717 0.106280 0.481777 0.573031 0.046104 0.13003 0.109446 0.002313 0.481761 0.010404 1.003439 0.148661 0.149113 0.082128 0.316978 0.924212 0.007875 0.037178 0.000487 1.0 0.005549 0.253756 0.075213 0.009911 0.102926 0.078010 0.135256 0.05855 0.044626 0.509549 0.008299 0.001804 0.005779 0.505277 0.007095 0.501574 0.009669 1.0 0.081778 0.009226 0.009876 0.007885 0.009767 0.007391 0.008429 0.363639 0.269508 0.001070 0.001198 0.009938 0.008982 0.054798 0.003986 0.002093 0.006496 0.005303 0.007658 0.001471 0.002264 1.006851 0.000424 0.001323 0.001392 0.009447 0.001465 0.000372 0.03644 1.006273 0.005403 0.100891 0.006563 0.503488 0.010675 0.182436 1.007199 0.172798 0.648812 0.269872 1.000615 0.306887 0.006539 0.001750 1.001116 0.004646 0.002871 0.986852 0.881867 1.019099 0.891789 0.019339 0.320809 0.175189 0.704642 0.009834 0.473014 0.496838 0.020089 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0
451750 690f87ac1353a1ef667fa941e321477102811b642e71bda7ee31f31e0a324575 2017-03-18 1 0.550235 0.014174 0.489797 0.007269 0.006078 0.106068 0.001858 1.000686 0.985567 0.198068 1.007074 0.028153 1.008544 0.004195 0.505404 0.595276 0.000755 0.178551 0.004988 0.009959 0.087017 0.005099 0.907538 1 0.004282 0.006677 0.009813 0.552285 0.14597 0.303336 0.282492 0.008354 0.022975 0.016795 0.009934 0.691952 1.007928 0.645657 0.009052 0.003526 0.009925 0.333975 0.136416 0.007719 0.005392 0.007642 0.891639 1.003717 1.009590 0.200134 0.028547 0.281556 0.284744 0.233213 0.148046 0.004905 0.092844 0.001623 0.002265 0.001137 0.008589 0.005200 0.009643 0.16179 0.009668 0.211991 0.005103 0.253976 0.005023 0.006987 0.005393 0.002461 0.006307 0.378180 0.002981 0.001905 0.000468 0.000191 0.001918 0.12183 0.192925 0.255509 0.099420 0.486834 0.334910 0.757523 0.13003 0.109446 0.009742 0.011181 0.013349 1.003327 0.551844 0.149113 0.398534 0.238379 0.539913 0.935858 0.007404 0.007285 1.0 0.000732 0.506233 0.010650 0.006161 0.102926 0.218794 0.276046 0.05855 0.205999 0.009848 0.005611 0.002026 0.003054 0.504428 0.009692 0.009721 0.006141 1.0 0.081778 0.001494 0.007583 0.009587 0.003169 0.007167 0.001000 0.472415 0.412725 0.001032 0.004999 0.005112 0.001965 1.005624 0.004845 0.000833 0.002345 0.003151 0.007910 0.005808 0.007583 0.007328 0.006428 0.006203 0.001365 0.000948 0.006021 0.003025 0.03644 1.005733 0.003526 0.008849 0.003744 0.000786 0.001697 0.006504 0.001919 0.172798 0.284922 0.188889 0.007357 0.506795 0.006982 0.004230 0.003931 0.002829 0.009914 0.970445 0.686452 0.960000 0.684947 0.001207 0.009092 0.369557 0.050117 0.007323 0.273249 0.000872 0.019381 0 1 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
336998 4e41c8de6ad791f70548dd1e3f5dd7641885180c240e57aa1122aab230ddf9e6 2017-08-28 0 0.001704 0.295186 0.009168 0.059050 0.015061 0.004249 0.003515 0.009922 0.932517 1.003641 0.002695 0.812935 0.005447 0.004822 0.004476 0.026788 0.002857 0.002045 0.004380 0.006927 0.065456 0.005099 0.009482 1 0.004631 1.001611 0.005051 0.007671 0.14597 0.059279 0.036768 0.005562 0.022975 0.034604 0.177883 0.033266 0.003200 0.001994 0.000987 0.006254 0.001904 0.333975 0.136416 0.004313 0.005392 0.009597 0.891639 1.003717 1.004763 0.208888 0.836221 0.818625 0.819371 0.732808 0.430107 0.000682 0.138326 0.002661 0.005841 1.004648 1.004891 0.008831 0.003620 0.16179 0.008020 0.211991 0.005103 0.253976 0.005023 0.006987 0.005000 0.001946 0.001037 0.378180 0.004152 0.006598 0.008179 0.007660 0.004461 0.12183 0.088647 0.001576 0.317622 0.423614 0.668637 0.108188 0.13003 0.290106 0.004070 0.228856 0.013349 1.001418 0.148305 0.115426 0.003264 0.170666 0.073078 0.107157 0.503322 0.003896 1.0 0.007260 0.007641 0.094936 0.007068 0.102926 0.003147 0.009368 0.05855 0.467864 0.004748 0.008386 0.001245 0.006583 0.504227 0.005105 0.009094 0.002025 1.0 0.081778 0.002630 0.002007 0.008523 0.003001 0.005839 0.008463 1.004331 0.623208 0.004379 0.008625 0.000019 0.001013 1.004802 0.000863 0.003678 0.005060 0.002457 0.005356 0.002545 0.005371 0.007462 0.008011 0.008939 0.006929 0.003677 0.005429 0.002318 0.03644 1.008289 0.000226 0.002526 0.005590 0.005532 0.005212 0.007322 0.003281 0.172798 0.285991 0.186971 0.004755 0.508953 0.001270 0.009659 0.008348 0.001551 0.002803 0.297968 0.139452 0.080747 0.974545 0.001003 0.287958 0.163977 0.008608 1.005079 0.139884 0.004941 0.019381 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
446688 67de95d0e4bb392f2384d45f5d1944269de0cb37e5f2b77ceb7107c6ef18c05a 2017-10-09 1 0.277471 0.018577 0.231653 0.014207 0.025692 0.066600 0.008354 1.007376 1.001243 0.209912 0.368584 0.083958 0.948962 0.007272 0.009839 0.250800 0.005238 0.116631 0.001909 0.005859 0.087666 0.005099 0.361439 1 0.003556 0.003434 0.005899 0.278873 0.14597 0.196832 0.304891 0.000061 0.022975 0.004144 0.010874 0.296214 1.008416 0.482246 0.003631 0.002465 0.003741 0.333975 0.136416 0.007364 0.005392 0.006433 0.891639 1.003717 0.005043 0.203524 0.059302 0.061084 0.055094 0.145271 0.434661 0.001164 0.050673 0.008965 0.008984 1.003624 1.004067 0.003678 0.003218 0.16179 0.002409 0.211991 0.005103 0.253976 0.005023 0.006987 1.006120 0.002359 0.931348 0.373401 1.005839 0.239530 0.459985 0.324037 0.285931 0.12183 0.078766 0.009928 0.040742 0.450376 0.134951 0.509138 0.13003 0.067773 0.003925 0.087683 0.003649 1.006069 0.579490 0.104387 0.164118 0.420743 0.365058 0.611261 0.026196 0.004948 1.0 0.003961 0.007824 0.004159 0.003793 0.102926 0.074460 0.138262 0.05855 0.205999 0.006376 0.006612 0.009926 0.007995 0.504428 0.009803 0.006562 0.009056 1.0 0.081778 0.008971 0.009339 0.000474 0.003738 0.006572 0.003103 0.543514 0.897214 0.004518 0.004120 0.005425 0.006901 1.008179 0.003851 0.006478 0.004763 0.009795 0.001420 0.008240 0.000049 0.002438 0.001948 0.007413 0.004663 0.007063 0.004741 0.005064 0.03644 1.006411 0.007799 0.005209 0.005652 0.008214 0.009816 0.003596 0.003489 0.172798 0.329063 0.189788 0.286854 0.400545 0.005367 0.004743 0.000409 0.003395 0.004220 0.953209 0.131797 0.950913 0.974243 0.005657 0.009277 0.179880 0.035951 0.000421 0.265130 0.315731 0.019381 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
In [ ]:
 
In [ ]:
 

Feature Engineering¶

In [29]:
df2['S_2'] = pd.to_datetime(df2['S_2'])
end_date = df2['S_2'].max()

# Date ranges
last_3_months = end_date - pd.DateOffset(months=3)
last_6_months = end_date - pd.DateOffset(months=6)
last_9_months = end_date - pd.DateOffset(months=9)
last_12_months = end_date - pd.DateOffset(months=12)

# Exclusion categorical columns
exclusion_list = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'S_2']
columns_to_include = [col for col in df2.columns if not col.startswith(tuple(exclusion_list))]

# New features
df2['S_Total'] = df2[columns_to_include].filter(like='S_').sum(axis=1)
df2['P_Total'] = df2[columns_to_include].filter(like='P_').sum(axis=1)
df2['B_Total'] = df2[columns_to_include].filter(like='B_').sum(axis=1)
df2['R_Total'] = df2[columns_to_include].filter(like='R_').sum(axis=1)

df2['D_Ave'] = df2[columns_to_include].filter(like='D_').mean(axis=1)
df2['S_Ave'] = df2[columns_to_include].filter(like='S_').mean(axis=1)
df2['P_Ave'] = df2[columns_to_include].filter(like='P_').mean(axis=1)
df2['B_Ave'] = df2[columns_to_include].filter(like='B_').mean(axis=1)
df2['R_Ave'] = df2[columns_to_include].filter(like='R_').mean(axis=1)

df2['D_Ave'] = df2[columns_to_include].filter(like='D_').mean(axis=1)
df2['S_Ave'] = df2[columns_to_include].filter(like='S_').mean(axis=1)
df2['P_Ave'] = df2[columns_to_include].filter(like='P_').mean(axis=1)
df2['B_Ave'] = df2[columns_to_include].filter(like='B_').mean(axis=1)
df2['R_Ave'] = df2[columns_to_include].filter(like='R_').mean(axis=1)


df2['S_Ave_3_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['S_Ave_6_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['S_Ave_9_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['S_Ave_12_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_12_months].mean(axis=1)

df2['P_Ave_3_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['P_Ave_6_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['P_Ave_9_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['P_Ave_12_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_12_months].mean(axis=1)

df2['R_Ave_3_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['R_Ave_6_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['R_Ave_9_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['R_Ave_12_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_12_months].mean(axis=1)

df2['B_Ave_3_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['B_Ave_6_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['B_Ave_9_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['B_Ave_12_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_12_months].mean(axis=1)

df2['D_Ave_3_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['D_Ave_6_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['D_Ave_9_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['D_Ave_12_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_12_months].mean(axis=1)
In [30]:
print(list(df2.columns))
['customer_ID', 'S_2', 'target', 'B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3', 'B_31', 'B_32', 'B_33', 'B_36', 'B_37', 'B_39', 'B_4', 'B_40', 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'D_108', 'D_109', 'D_110', 'D_111', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83', 'D_84', 'D_86', 'D_87', 'D_88', 'D_89', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96', 'P_2', 'P_3', 'P_4', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17', 'R_18', 'R_19', 'R_2', 'R_20', 'R_21', 'R_22', 'R_23', 'R_24', 'R_25', 'R_26', 'R_27', 'R_28', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'B_30_0.0', 'B_30_1.0', 'B_30_2.0', 'B_38_1.0', 'B_38_2.0', 'B_38_3.0', 'B_38_4.0', 'B_38_5.0', 'B_38_6.0', 'B_38_7.0', 'D_114_0.0', 'D_114_1.0', 'D_116_0.0', 'D_116_1.0', 'D_117_-1.0', 'D_117_1.0', 'D_117_2.0', 'D_117_3.0', 'D_117_4.0', 'D_117_5.0', 'D_117_6.0', 'D_120_0.0', 'D_120_1.0', 'D_126_-1.0', 'D_126_0.0', 'D_126_1.0', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'D_64_-1', 'D_64_O', 'D_64_R', 'D_64_U', 'D_68_0.0', 'D_68_1.0', 'D_68_2.0', 'D_68_3.0', 'D_68_4.0', 'D_68_5.0', 'D_68_6.0', 'S_Total', 'P_Total', 'B_Total', 'R_Total', 'D_Ave', 'S_Ave', 'P_Ave', 'B_Ave', 'R_Ave', 'S_Ave_3_months', 'S_Ave_6_months', 'S_Ave_9_months', 'S_Ave_12_months', 'P_Ave_3_months', 'P_Ave_6_months', 'P_Ave_9_months', 'P_Ave_12_months', 'R_Ave_3_months', 'R_Ave_6_months', 'R_Ave_9_months', 'R_Ave_12_months', 'B_Ave_3_months', 'B_Ave_6_months', 'B_Ave_9_months', 'B_Ave_12_months', 'D_Ave_3_months', 'D_Ave_6_months', 'D_Ave_9_months', 'D_Ave_12_months']
In [ ]:
 

Split the DataFrame¶

In [31]:
X = df2.drop(['customer_ID', 'target', 'S_2'], axis=1)
y = df2['target']

from sklearn.model_selection import train_test_split

# split 1
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# split 2
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_temp, y_temp, test_size=0.5, random_state=59)
In [32]:
print(X_train.shape)
print(y_train.shape)
print(X_test1.shape)
print(y_test1.shape)
print(X_test2.shape)
print(y_test2.shape)
(537433, 250)
(537433,)
(115164, 250)
(115164,)
(115165, 250)
(115165,)
In [33]:
# X_train.to_csv('X_train.csv', index=False)
# y_train.to_csv('y_train.csv', index=False)

# X_test1.to_csv('X_test1.csv', index=False)
# y_test1.to_csv('y_test1.csv', index=False)

# X_test2.to_csv('X_test2.csv', index=False)
# y_test2.to_csv('y_test2.csv', index=False)

XGBoost Model 1¶

In [34]:
from xgboost import XGBClassifier

xgb_m1 = XGBClassifier(random_state=1, use_label_encoder=False, seed=69)
xgb_m1.fit(X_train, y_train)
Out[34]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=1, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=1, ...)
In [36]:
# Get feature importances
feature_importances = xgb_m1.feature_importances_

# Print feature importance for features with importance > 0.5%
print("Features with importance higher than 0.5%:")
for feature_name, importance in zip(X_train.columns, feature_importances):
    if importance > 0.005:
        print(f'Feature: {feature_name}, Importance: {importance}')
Features with importance higher than 0.5%:
Feature: B_1, Importance: 0.062093429267406464
Feature: B_10, Importance: 0.005027878098189831
Feature: B_2, Importance: 0.019205378368496895
Feature: B_3, Importance: 0.011666232720017433
Feature: B_4, Importance: 0.005044857505708933
Feature: B_7, Importance: 0.010539853014051914
Feature: B_9, Importance: 0.02490614727139473
Feature: D_112, Importance: 0.005575456190854311
Feature: D_129, Importance: 0.005559643264859915
Feature: D_41, Importance: 0.00764946173876524
Feature: D_42, Importance: 0.018204940482974052
Feature: D_43, Importance: 0.006187329534441233
Feature: D_44, Importance: 0.0065676490776240826
Feature: D_45, Importance: 0.008879962377250195
Feature: D_46, Importance: 0.007054412737488747
Feature: D_48, Importance: 0.011242859996855259
Feature: D_51, Importance: 0.010159661062061787
Feature: D_75, Importance: 0.006474910769611597
Feature: D_79, Importance: 0.007479172199964523
Feature: P_2, Importance: 0.22402159869670868
Feature: R_1, Importance: 0.01718958094716072
Feature: R_27, Importance: 0.01262607891112566
Feature: S_23, Importance: 0.007821113802492619
Feature: S_3, Importance: 0.011532943695783615
Feature: B_38_2.0, Importance: 0.012651875615119934
Feature: B_38_4.0, Importance: 0.006722653284668922
Feature: B_Total, Importance: 0.006749620195478201
Feature: R_Total, Importance: 0.013241227716207504
Feature: B_Ave, Importance: 0.007870757952332497
Feature: R_Ave, Importance: 0.05340268090367317
Feature: R_Ave_12_months, Importance: 0.005185040645301342
Feature: B_Ave_3_months, Importance: 0.005713469814509153
Feature: B_Ave_6_months, Importance: 0.006645355373620987
In [37]:
feature_importances = xgb_m1.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Filter the DataFrame for features with importance > 0.5%
important_features_df = feature_importance_df[feature_importance_df['Importance'] > 0.005]

# Sort the DataFrame based on the importance in descending order
important_features_df = important_features_df.sort_values(by='Importance', ascending=False)

# print("Features with importance higher than 0.5%:")
# print(important_features_df)

# Plotting
plt.figure(figsize=(10, 8))  # Adjust the figure size as necessary
sns.barplot(x='Importance', y='Feature', data=important_features_df.sort_values('Importance', ascending=False))

plt.title('Feature Importance (>0.5%)')
plt.xlabel('Importance')
plt.ylabel('Features')

plt.show()
No description has been provided for this image
In [38]:
important_features_1 = []

for feature_name, importance in zip(X_train.columns, feature_importances):
    if importance > 0.005:  # Check if feature importance is greater than 0.5%
        important_features_1.append(feature_name)

print("Important features:", important_features_1)
Important features: ['B_1', 'B_10', 'B_2', 'B_3', 'B_4', 'B_7', 'B_9', 'D_112', 'D_129', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_48', 'D_51', 'D_75', 'D_79', 'P_2', 'R_1', 'R_27', 'S_23', 'S_3', 'B_38_2.0', 'B_38_4.0', 'B_Total', 'R_Total', 'B_Ave', 'R_Ave', 'R_Ave_12_months', 'B_Ave_3_months', 'B_Ave_6_months']

XGBoost Model 2¶

In [39]:
xgb_m2 = XGBClassifier(
    n_estimators = 300,
    learning_rate = 0.5,
    max_depth = 4,
    subsample=0.5,  # Use 50% of observations to build each tree
    colsample_bytree=0.5,  # Use 50% of features to build each tree
    scale_pos_weight=5,  # Assign a weight of 5 to default observations
    eval_metric='logloss',
    use_label_encoder=False,
    seed=42
)

xgb_m2.fit(X_train, y_train)
Out[39]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)
In [40]:
# Get feature importances
feature_importances_2 = xgb_m2.feature_importances_

# Print feature importance for features with importance > 0.5%
print("Features with importance higher than 0.5%:")
for feature_name, importance in zip(X_train.columns, feature_importances_2):
    if importance > 0.005:
        print(f'Feature: {feature_name}, Importance: {importance}')
Features with importance higher than 0.5%:
Feature: B_1, Importance: 0.009449629113078117
Feature: B_3, Importance: 0.007355320733040571
Feature: B_37, Importance: 0.12651139497756958
Feature: B_9, Importance: 0.02417602762579918
Feature: D_132, Importance: 0.006089573726058006
Feature: D_41, Importance: 0.0075592705979943275
Feature: D_42, Importance: 0.03347364068031311
Feature: D_43, Importance: 0.005484941881150007
Feature: D_45, Importance: 0.007913424633443356
Feature: D_48, Importance: 0.1390070617198944
Feature: D_49, Importance: 0.005143444053828716
Feature: D_51, Importance: 0.007368069142103195
Feature: D_52, Importance: 0.009602224454283714
Feature: D_75, Importance: 0.016339194029569626
Feature: P_2, Importance: 0.06553985178470612
Feature: R_26, Importance: 0.007541041821241379
Feature: S_3, Importance: 0.012038648128509521
Feature: S_7, Importance: 0.010565096512436867
Feature: P_Total, Importance: 0.011251780204474926
Feature: B_Ave, Importance: 0.006103881634771824
Feature: R_Ave, Importance: 0.025396572425961494
Feature: S_Ave_6_months, Importance: 0.005313452798873186
In [41]:
feature_importances_2 = xgb_m2.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df_2 = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances_2
})

# Filter the DataFrame for features with importance > 0.5%
important_features_df_2 = feature_importance_df_2[feature_importance_df['Importance'] > 0.005]

# Sort the DataFrame based on the importance in descending order
important_features_df_2 = important_features_df_2.sort_values(by='Importance', ascending=False)

# print("Features with importance higher than 0.5%:")
# print(important_features_df_2)

# Plotting
plt.figure(figsize=(10, 8))  # Adjust the figure size as necessary
sns.barplot(x='Importance', y='Feature', data=important_features_df_2.sort_values('Importance', ascending=False))

plt.title('Feature Importance (>0.5%)')
plt.xlabel('Importance')
plt.ylabel('Features')

plt.show()
No description has been provided for this image
In [42]:
important_features_2 = []

for feature_name, importance in zip(X_train.columns, feature_importances_2):
    if importance > 0.005:  # Check if feature importance is greater than 0.5%
        important_features_2.append(feature_name)

print("Important features:", important_features_2)
Important features: ['B_1', 'B_3', 'B_37', 'B_9', 'D_132', 'D_41', 'D_42', 'D_43', 'D_45', 'D_48', 'D_49', 'D_51', 'D_52', 'D_75', 'P_2', 'R_26', 'S_3', 'S_7', 'P_Total', 'B_Ave', 'R_Ave', 'S_Ave_6_months']

Consolidate Most Important Features from Both the Models¶

In [43]:
print(important_features_1)
print('\n')
print(important_features_2)
print('\n')


# get a consolidated list of all these features:
unique_elements = set(important_features_1).union(set(important_features_2))

features = list(unique_elements)

print(features)
['B_1', 'B_10', 'B_2', 'B_3', 'B_4', 'B_7', 'B_9', 'D_112', 'D_129', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_48', 'D_51', 'D_75', 'D_79', 'P_2', 'R_1', 'R_27', 'S_23', 'S_3', 'B_38_2.0', 'B_38_4.0', 'B_Total', 'R_Total', 'B_Ave', 'R_Ave', 'R_Ave_12_months', 'B_Ave_3_months', 'B_Ave_6_months']


['B_1', 'B_3', 'B_37', 'B_9', 'D_132', 'D_41', 'D_42', 'D_43', 'D_45', 'D_48', 'D_49', 'D_51', 'D_52', 'D_75', 'P_2', 'R_26', 'S_3', 'S_7', 'P_Total', 'B_Ave', 'R_Ave', 'S_Ave_6_months']


['D_52', 'B_Ave_3_months', 'D_112', 'B_38_2.0', 'B_2', 'R_Ave', 'R_Ave_12_months', 'S_7', 'R_1', 'B_38_4.0', 'R_26', 'B_7', 'D_41', 'P_2', 'D_129', 'D_132', 'B_3', 'D_49', 'D_45', 'D_44', 'B_1', 'D_46', 'B_37', 'P_Total', 'B_Total', 'D_51', 'S_Ave_6_months', 'D_75', 'R_27', 'B_9', 'B_Ave_6_months', 'D_79', 'D_48', 'S_23', 'D_43', 'B_4', 'B_10', 'B_Ave', 'R_Total', 'S_3', 'D_42']
In [ ]:
 

Setting Aside the Data For Strategy¶

In [47]:
columns_to_include = ["customer_ID", "target", "S_2"] + features

# Create df3 by selecting the specified columns from df2
df3 = df2[columns_to_include]
In [ ]:
 

Recreate Training and Testing Data to Include Only the above Features¶

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

X = df3.drop(['customer_ID','S_2','target'], axis=1)
y = df3['target']

X_train, X_test1, y_train, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)
In [49]:
print(X_train.shape)
print(y_train.shape)
print(X_test1.shape)
print(y_test1.shape)
print(X_test2.shape)
print(y_test2.shape)
(537433, 41)
(537433,)
(115164, 41)
(115164,)
(115165, 41)
(115165,)

XGBoost GridSearch¶

In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

param_grid = {
    'n_estimators': [50, 100, 300],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.5, 0.8],  # % of obs each tree
    'colsample_bytree': [0.5, 1.0],  # % of features each tree
    'scale_pos_weight': [1, 5, 10]  # Weight of default observations
}

xgb_clf = XGBClassifier(use_label_encoder=False, seed=4)

grid_search_xgb = GridSearchCV(estimator=xgb_clf, 
                               param_grid=param_grid, 
                               scoring='roc_auc', 
                               cv=3, verbose=3)
grid_search_xgb.fit(X_train, y_train)

# best_model = grid_search_xgb.best_estimator_yes
Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.926 total time=   2.5s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.927 total time=   2.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.928 total time=   2.6s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.926 total time=   2.4s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.927 total time=   2.5s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.928 total time=   2.8s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.925 total time=   2.5s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.926 total time=   3.3s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.927 total time=   2.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.925 total time=   2.9s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.926 total time=   2.9s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.927 total time=   2.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.924 total time=   2.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.925 total time=   2.6s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.926 total time=   2.5s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.924 total time=   2.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.925 total time=   2.5s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.926 total time=   2.6s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.928 total time=   3.9s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.929 total time=   3.9s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.930 total time=   3.9s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.928 total time=   3.8s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.929 total time=   3.9s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.930 total time=   4.0s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.927 total time=   4.0s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.927 total time=   4.1s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.929 total time=   3.9s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.927 total time=   4.0s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.927 total time=   4.0s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.928 total time=   4.0s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.926 total time=   4.0s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.927 total time=   4.0s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.928 total time=   4.0s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.926 total time=   4.1s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.927 total time=   3.9s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.928 total time=   4.0s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.933 total time=  10.4s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.933 total time=   9.3s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.934 total time=   9.2s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.933 total time=   9.3s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.933 total time=   9.2s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.934 total time=   9.2s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.932 total time=   9.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.932 total time=   9.9s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.934 total time=   9.5s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.932 total time=   9.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.932 total time=   9.4s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.934 total time=   9.5s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.931 total time=   9.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.932 total time=   9.6s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.933 total time=   9.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.931 total time=   9.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.932 total time=   9.6s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.933 total time=   9.5s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.935 total time=   2.5s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.935 total time=   2.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.934 total time=   2.6s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.935 total time=   2.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.936 total time=   2.6s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.934 total time=   3.2s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.935 total time=   2.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.936 total time=   2.8s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.934 total time=   3.2s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.934 total time=   2.6s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.935 total time=   2.6s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.935 total time=   2.5s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.936 total time=   2.6s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.937 total time=   3.7s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.937 total time=   3.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.938 total time=   4.3s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.937 total time=   3.7s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.938 total time=   3.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.939 total time=   3.8s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.936 total time=   3.7s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.937 total time=   3.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.938 total time=   3.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.937 total time=   3.7s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.938 total time=   3.8s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.938 total time=   3.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.936 total time=   3.7s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.937 total time=   3.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.938 total time=   3.6s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.936 total time=   3.7s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.937 total time=   3.7s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.938 total time=   3.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.939 total time=   8.2s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.940 total time=   9.2s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.941 total time=   9.9s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.940 total time=   8.0s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.941 total time=   7.8s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.942 total time=   9.7s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.938 total time=   8.2s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.939 total time=   8.1s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.940 total time=   8.3s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.939 total time=   8.1s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.940 total time=   8.0s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.941 total time=   8.0s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.938 total time=   8.2s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.939 total time=   9.4s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.939 total time=   8.3s
[CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.939 total time=   8.1s
[CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.940 total time=   8.0s
[CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.940 total time=   8.2s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.923 total time=   2.7s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.924 total time=   2.6s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.925 total time=   2.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.922 total time=   2.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.923 total time=   2.6s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.925 total time=   2.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.922 total time=   2.8s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.923 total time=   3.0s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.924 total time=   2.7s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.921 total time=   2.8s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.922 total time=   2.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.923 total time=   2.7s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.921 total time=   2.7s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.921 total time=   2.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.923 total time=   2.7s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.920 total time=   2.7s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.921 total time=   2.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.922 total time=   2.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.926 total time=   4.1s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.927 total time=   4.3s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.928 total time=   3.8s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.925 total time=   3.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.926 total time=   3.8s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.927 total time=   3.8s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.924 total time=   3.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.925 total time=   3.9s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.926 total time=   4.0s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.924 total time=   3.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.925 total time=   3.9s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.926 total time=   3.9s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.923 total time=   3.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.924 total time=   4.0s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.926 total time=   3.9s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.923 total time=   3.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.924 total time=   4.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.925 total time=   4.1s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.932 total time=   9.2s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.933 total time=   9.0s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.934 total time=   9.0s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.932 total time=   9.0s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.933 total time=   8.9s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.934 total time=   9.3s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.931 total time=   9.3s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.932 total time=   9.3s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.933 total time=   9.9s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.931 total time=   9.2s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.932 total time=   9.3s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.933 total time=   9.4s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.931 total time=   9.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.932 total time=   9.5s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.933 total time=  10.1s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.931 total time=  10.2s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.932 total time=  10.2s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.933 total time=   9.3s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.935 total time=   2.5s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.935 total time=   2.4s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.935 total time=   2.5s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.935 total time=   2.5s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.934 total time=   2.5s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.935 total time=   2.6s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.934 total time=   2.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.935 total time=   2.5s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.936 total time=   2.5s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.937 total time=   3.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.938 total time=   3.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.939 total time=   3.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.937 total time=   3.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.938 total time=   3.9s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.939 total time=   3.7s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.936 total time=   3.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.937 total time=   3.6s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.938 total time=   3.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.937 total time=   3.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.938 total time=   3.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.939 total time=   3.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.936 total time=   3.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.937 total time=   3.6s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.938 total time=   3.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.937 total time=   3.6s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.937 total time=   3.6s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.938 total time=   3.6s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.939 total time=   7.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.940 total time=   8.4s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.941 total time=   8.0s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.940 total time=   7.8s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.941 total time=   7.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.942 total time=   7.8s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.938 total time=   8.0s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.940 total time=   7.9s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.940 total time=   8.0s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.940 total time=   8.7s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.940 total time=   7.8s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.941 total time=   7.8s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.938 total time=   8.1s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.939 total time=   8.5s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.940 total time=   8.0s
[CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.939 total time=   7.9s
[CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.940 total time=   7.7s
[CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.941 total time=   7.7s
Out[50]:
GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num_parallel_tree=None,
                                     random_state=None, ...),
             param_grid={'colsample_bytree': [0.5, 1.0],
                         'learning_rate': [0.01, 0.1],
                         'n_estimators': [50, 100, 300],
                         'scale_pos_weight': [1, 5, 10],
                         'subsample': [0.5, 0.8]},
             scoring='roc_auc', verbose=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num_parallel_tree=None,
                                     random_state=None, ...),
             param_grid={'colsample_bytree': [0.5, 1.0],
                         'learning_rate': [0.01, 0.1],
                         'n_estimators': [50, 100, 300],
                         'scale_pos_weight': [1, 5, 10],
                         'subsample': [0.5, 0.8]},
             scoring='roc_auc', verbose=3)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In [52]:
from sklearn.metrics import roc_auc_score
import pandas as pd
from xgboost import XGBClassifier

# Initialize an empty DataFrame to store results
results_df_xgb = pd.DataFrame(columns=['n_estimators', 'learning_rate', 'Subsample %', 'Features',
                                       '% Weight of Default', 'AUC Train', 'AUC Test 1',
                                       'AUC Test 2'])

# Iterate over each combination of settings
for i, params in enumerate(grid_search_xgb.cv_results_['params']):
    
    # Initialize the model with current parameters
    model = XGBClassifier(use_label_encoder=False, seed=4, **params)
    
    # Fit the model with the current parameter combination
    model.fit(X_train, y_train)  # Ensure to include eval_metric to avoid warnings
    
    # Predict probabilities for each set
    train_pred = model.predict_proba(X_train)[:, 1]
    test1_pred = model.predict_proba(X_test1)[:, 1]
    test2_pred = model.predict_proba(X_test2)[:, 1]
    
    # Calculate AUC for each set
    auc_train = roc_auc_score(y_train, train_pred)
    auc_test1 = roc_auc_score(y_test1, test1_pred)
    auc_test2 = roc_auc_score(y_test2, test2_pred)
    
    # Create a temporary DataFrame with the results
    temp_df = pd.DataFrame({
        'n_estimators': [params.get('n_estimators', 'Not specified')],
        'learning_rate': [params.get('learning_rate', 'Not specified')],
        'Subsample %': [f"{params.get('subsample', 0) * 100}%"],
        'Features': [f"{params.get('colsample_bytree', 0) * 100}%"],
        '% Weight of Default': [params.get('scale_pos_weight', 'Not specified')],
        'AUC Train': [auc_train],
        'AUC Test 1': [auc_test1],
        'AUC Test 2': [auc_test2]
    })
    
    # Append the temporary DataFrame to the main results DataFrame
    results_df_xgb = pd.concat([results_df_xgb, temp_df], ignore_index=True)

# # Optionally, save the results to a CSV file
results_df_xgb.to_csv('grid_search_results_xgb.csv', index=False)

# Display the first few rows of the results DataFrame
results_df_xgb.head()
/tmp/ipykernel_9594/2378702306.py:42: FutureWarning:

The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.

Out[52]:
n_estimators learning_rate Subsample % Features % Weight of Default AUC Train AUC Test 1 AUC Test 2
0 50 0.01 50.0% 50.0% 1 0.928372 0.928303 0.926287
1 50 0.01 80.0% 50.0% 1 0.928376 0.928324 0.926273
2 50 0.01 50.0% 50.0% 5 0.926768 0.926775 0.925061
3 50 0.01 80.0% 50.0% 5 0.926663 0.926621 0.924905
4 50 0.01 50.0% 50.0% 10 0.926124 0.925888 0.924271
In [53]:
results_df_xgb
Out[53]:
n_estimators learning_rate Subsample % Features % Weight of Default AUC Train AUC Test 1 AUC Test 2
0 50 0.01 50.0% 50.0% 1 0.928372 0.928303 0.926287
1 50 0.01 80.0% 50.0% 1 0.928376 0.928324 0.926273
2 50 0.01 50.0% 50.0% 5 0.926768 0.926775 0.925061
3 50 0.01 80.0% 50.0% 5 0.926663 0.926621 0.924905
4 50 0.01 50.0% 50.0% 10 0.926124 0.925888 0.924271
5 50 0.01 80.0% 50.0% 10 0.926137 0.925835 0.924257
6 100 0.01 50.0% 50.0% 1 0.930257 0.930116 0.928110
7 100 0.01 80.0% 50.0% 1 0.930245 0.930050 0.928078
8 100 0.01 50.0% 50.0% 5 0.928713 0.928592 0.926844
9 100 0.01 80.0% 50.0% 5 0.928743 0.928570 0.926841
10 100 0.01 50.0% 50.0% 10 0.928194 0.927968 0.926264
11 100 0.01 80.0% 50.0% 10 0.928240 0.927970 0.926294
12 300 0.01 50.0% 50.0% 1 0.935255 0.934829 0.932786
13 300 0.01 80.0% 50.0% 1 0.935363 0.934800 0.932798
14 300 0.01 50.0% 50.0% 5 0.934256 0.933868 0.931979
15 300 0.01 80.0% 50.0% 5 0.934359 0.933898 0.931973
16 300 0.01 50.0% 50.0% 10 0.933984 0.933526 0.931654
17 300 0.01 80.0% 50.0% 10 0.934116 0.933613 0.931757
18 50 0.10 50.0% 50.0% 1 0.937436 0.936724 0.934535
19 50 0.10 80.0% 50.0% 1 0.937710 0.936669 0.934671
20 50 0.10 50.0% 50.0% 5 0.937022 0.936154 0.934143
21 50 0.10 80.0% 50.0% 5 0.937327 0.936293 0.934288
22 50 0.10 50.0% 50.0% 10 0.936832 0.935928 0.934052
23 50 0.10 80.0% 50.0% 10 0.937251 0.936210 0.934328
24 100 0.10 50.0% 50.0% 1 0.941613 0.939421 0.937452
25 100 0.10 80.0% 50.0% 1 0.942032 0.939454 0.937744
26 100 0.10 50.0% 50.0% 5 0.941202 0.938948 0.936946
27 100 0.10 80.0% 50.0% 5 0.941761 0.939196 0.937284
28 100 0.10 50.0% 50.0% 10 0.940957 0.938643 0.936980
29 100 0.10 80.0% 50.0% 10 0.941552 0.938998 0.937176
30 300 0.10 50.0% 50.0% 1 0.950337 0.942353 0.940698
31 300 0.10 80.0% 50.0% 1 0.951250 0.943107 0.941669
32 300 0.10 50.0% 50.0% 5 0.949579 0.941566 0.940027
33 300 0.10 80.0% 50.0% 5 0.950630 0.942517 0.941033
34 300 0.10 50.0% 50.0% 10 0.948608 0.940792 0.939439
35 300 0.10 80.0% 50.0% 10 0.949791 0.941820 0.940312
36 50 0.01 50.0% 100.0% 1 0.925120 0.925311 0.922637
37 50 0.01 80.0% 100.0% 1 0.924699 0.924838 0.922121
38 50 0.01 50.0% 100.0% 5 0.923819 0.923977 0.921518
39 50 0.01 80.0% 100.0% 5 0.923333 0.923434 0.920992
40 50 0.01 50.0% 100.0% 10 0.922479 0.922612 0.919991
41 50 0.01 80.0% 100.0% 10 0.922311 0.922456 0.919779
42 100 0.01 50.0% 100.0% 1 0.928229 0.928273 0.925745
43 100 0.01 80.0% 100.0% 1 0.927909 0.927887 0.925345
44 100 0.01 50.0% 100.0% 5 0.926455 0.926499 0.924102
45 100 0.01 80.0% 100.0% 5 0.926160 0.926101 0.923743
46 100 0.01 50.0% 100.0% 10 0.925621 0.925564 0.923127
47 100 0.01 80.0% 100.0% 10 0.925468 0.925264 0.922913
48 300 0.01 50.0% 100.0% 1 0.935036 0.934519 0.932142
49 300 0.01 80.0% 100.0% 1 0.935049 0.934341 0.931995
50 300 0.01 50.0% 100.0% 5 0.934065 0.933575 0.931288
51 300 0.01 80.0% 100.0% 5 0.934097 0.933516 0.931220
52 300 0.01 50.0% 100.0% 10 0.933781 0.933239 0.930966
53 300 0.01 80.0% 100.0% 10 0.933785 0.933121 0.930876
54 50 0.10 50.0% 100.0% 1 0.937919 0.936702 0.934670
55 50 0.10 80.0% 100.0% 1 0.938281 0.936941 0.934706
56 50 0.10 50.0% 100.0% 5 0.937449 0.936355 0.934286
57 50 0.10 80.0% 100.0% 5 0.937841 0.936566 0.934483
58 50 0.10 50.0% 100.0% 10 0.937397 0.936300 0.934179
59 50 0.10 80.0% 100.0% 10 0.937579 0.936349 0.934352
60 100 0.10 50.0% 100.0% 1 0.942236 0.939463 0.937600
61 100 0.10 80.0% 100.0% 1 0.942833 0.939768 0.937917
62 100 0.10 50.0% 100.0% 5 0.941946 0.939199 0.937312
63 100 0.10 80.0% 100.0% 5 0.942456 0.939453 0.937724
64 100 0.10 50.0% 100.0% 10 0.941646 0.938857 0.936975
65 100 0.10 80.0% 100.0% 10 0.942023 0.939201 0.937381
66 300 0.10 50.0% 100.0% 1 0.951841 0.942651 0.941195
67 300 0.10 80.0% 100.0% 1 0.952678 0.943455 0.941911
68 300 0.10 50.0% 100.0% 5 0.951079 0.941968 0.940384
69 300 0.10 80.0% 100.0% 5 0.951965 0.942892 0.941367
70 300 0.10 50.0% 100.0% 10 0.949800 0.941164 0.939527
71 300 0.10 80.0% 100.0% 10 0.951010 0.942212 0.940624
In [54]:
# calculate the mean auc
results_df_xgb['Average AUC'] = results_df_xgb[['AUC Train', 'AUC Test 1', 'AUC Test 2']].mean(axis=1)

# calculate
results_df_xgb['Std AUC'] = results_df_xgb[['AUC Train', 'AUC Test 1', 'AUC Test 2']].std(axis=1)

results_df_xgb
# results_df_xgb.iloc[:, 5:]
Out[54]:
n_estimators learning_rate Subsample % Features % Weight of Default AUC Train AUC Test 1 AUC Test 2 Average AUC Std AUC
0 50 0.01 50.0% 50.0% 1 0.928372 0.928303 0.926287 0.927654 0.001184
1 50 0.01 80.0% 50.0% 1 0.928376 0.928324 0.926273 0.927658 0.001200
2 50 0.01 50.0% 50.0% 5 0.926768 0.926775 0.925061 0.926201 0.000988
3 50 0.01 80.0% 50.0% 5 0.926663 0.926621 0.924905 0.926063 0.001003
4 50 0.01 50.0% 50.0% 10 0.926124 0.925888 0.924271 0.925428 0.001009
5 50 0.01 80.0% 50.0% 10 0.926137 0.925835 0.924257 0.925410 0.001010
6 100 0.01 50.0% 50.0% 1 0.930257 0.930116 0.928110 0.929495 0.001201
7 100 0.01 80.0% 50.0% 1 0.930245 0.930050 0.928078 0.929458 0.001199
8 100 0.01 50.0% 50.0% 5 0.928713 0.928592 0.926844 0.928050 0.001046
9 100 0.01 80.0% 50.0% 5 0.928743 0.928570 0.926841 0.928051 0.001051
10 100 0.01 50.0% 50.0% 10 0.928194 0.927968 0.926264 0.927476 0.001055
11 100 0.01 80.0% 50.0% 10 0.928240 0.927970 0.926294 0.927501 0.001054
12 300 0.01 50.0% 50.0% 1 0.935255 0.934829 0.932786 0.934290 0.001320
13 300 0.01 80.0% 50.0% 1 0.935363 0.934800 0.932798 0.934320 0.001348
14 300 0.01 50.0% 50.0% 5 0.934256 0.933868 0.931979 0.933367 0.001218
15 300 0.01 80.0% 50.0% 5 0.934359 0.933898 0.931973 0.933410 0.001266
16 300 0.01 50.0% 50.0% 10 0.933984 0.933526 0.931654 0.933054 0.001235
17 300 0.01 80.0% 50.0% 10 0.934116 0.933613 0.931757 0.933162 0.001243
18 50 0.10 50.0% 50.0% 1 0.937436 0.936724 0.934535 0.936232 0.001512
19 50 0.10 80.0% 50.0% 1 0.937710 0.936669 0.934671 0.936350 0.001545
20 50 0.10 50.0% 50.0% 5 0.937022 0.936154 0.934143 0.935773 0.001477
21 50 0.10 80.0% 50.0% 5 0.937327 0.936293 0.934288 0.935969 0.001545
22 50 0.10 50.0% 50.0% 10 0.936832 0.935928 0.934052 0.935604 0.001418
23 50 0.10 80.0% 50.0% 10 0.937251 0.936210 0.934328 0.935930 0.001482
24 100 0.10 50.0% 50.0% 1 0.941613 0.939421 0.937452 0.939495 0.002082
25 100 0.10 80.0% 50.0% 1 0.942032 0.939454 0.937744 0.939743 0.002159
26 100 0.10 50.0% 50.0% 5 0.941202 0.938948 0.936946 0.939032 0.002129
27 100 0.10 80.0% 50.0% 5 0.941761 0.939196 0.937284 0.939414 0.002246
28 100 0.10 50.0% 50.0% 10 0.940957 0.938643 0.936980 0.938860 0.001997
29 100 0.10 80.0% 50.0% 10 0.941552 0.938998 0.937176 0.939242 0.002198
30 300 0.10 50.0% 50.0% 1 0.950337 0.942353 0.940698 0.944462 0.005154
31 300 0.10 80.0% 50.0% 1 0.951250 0.943107 0.941669 0.945342 0.005167
32 300 0.10 50.0% 50.0% 5 0.949579 0.941566 0.940027 0.943724 0.005128
33 300 0.10 80.0% 50.0% 5 0.950630 0.942517 0.941033 0.944727 0.005166
34 300 0.10 50.0% 50.0% 10 0.948608 0.940792 0.939439 0.942946 0.004949
35 300 0.10 80.0% 50.0% 10 0.949791 0.941820 0.940312 0.943974 0.005093
36 50 0.01 50.0% 100.0% 1 0.925120 0.925311 0.922637 0.924356 0.001492
37 50 0.01 80.0% 100.0% 1 0.924699 0.924838 0.922121 0.923886 0.001530
38 50 0.01 50.0% 100.0% 5 0.923819 0.923977 0.921518 0.923105 0.001376
39 50 0.01 80.0% 100.0% 5 0.923333 0.923434 0.920992 0.922586 0.001381
40 50 0.01 50.0% 100.0% 10 0.922479 0.922612 0.919991 0.921694 0.001476
41 50 0.01 80.0% 100.0% 10 0.922311 0.922456 0.919779 0.921515 0.001505
42 100 0.01 50.0% 100.0% 1 0.928229 0.928273 0.925745 0.927416 0.001447
43 100 0.01 80.0% 100.0% 1 0.927909 0.927887 0.925345 0.927047 0.001474
44 100 0.01 50.0% 100.0% 5 0.926455 0.926499 0.924102 0.925685 0.001371
45 100 0.01 80.0% 100.0% 5 0.926160 0.926101 0.923743 0.925334 0.001379
46 100 0.01 50.0% 100.0% 10 0.925621 0.925564 0.923127 0.924771 0.001424
47 100 0.01 80.0% 100.0% 10 0.925468 0.925264 0.922913 0.924548 0.001420
48 300 0.01 50.0% 100.0% 1 0.935036 0.934519 0.932142 0.933899 0.001543
49 300 0.01 80.0% 100.0% 1 0.935049 0.934341 0.931995 0.933795 0.001599
50 300 0.01 50.0% 100.0% 5 0.934065 0.933575 0.931288 0.932976 0.001482
51 300 0.01 80.0% 100.0% 5 0.934097 0.933516 0.931220 0.932944 0.001521
52 300 0.01 50.0% 100.0% 10 0.933781 0.933239 0.930966 0.932662 0.001494
53 300 0.01 80.0% 100.0% 10 0.933785 0.933121 0.930876 0.932594 0.001525
54 50 0.10 50.0% 100.0% 1 0.937919 0.936702 0.934670 0.936431 0.001642
55 50 0.10 80.0% 100.0% 1 0.938281 0.936941 0.934706 0.936643 0.001806
56 50 0.10 50.0% 100.0% 5 0.937449 0.936355 0.934286 0.936030 0.001607
57 50 0.10 80.0% 100.0% 5 0.937841 0.936566 0.934483 0.936296 0.001695
58 50 0.10 50.0% 100.0% 10 0.937397 0.936300 0.934179 0.935959 0.001636
59 50 0.10 80.0% 100.0% 10 0.937579 0.936349 0.934352 0.936093 0.001629
60 100 0.10 50.0% 100.0% 1 0.942236 0.939463 0.937600 0.939766 0.002333
61 100 0.10 80.0% 100.0% 1 0.942833 0.939768 0.937917 0.940173 0.002483
62 100 0.10 50.0% 100.0% 5 0.941946 0.939199 0.937312 0.939486 0.002330
63 100 0.10 80.0% 100.0% 5 0.942456 0.939453 0.937724 0.939878 0.002394
64 100 0.10 50.0% 100.0% 10 0.941646 0.938857 0.936975 0.939159 0.002350
65 100 0.10 80.0% 100.0% 10 0.942023 0.939201 0.937381 0.939535 0.002339
66 300 0.10 50.0% 100.0% 1 0.951841 0.942651 0.941195 0.945229 0.005772
67 300 0.10 80.0% 100.0% 1 0.952678 0.943455 0.941911 0.946015 0.005822
68 300 0.10 50.0% 100.0% 5 0.951079 0.941968 0.940384 0.944477 0.005772
69 300 0.10 80.0% 100.0% 5 0.951965 0.942892 0.941367 0.945408 0.005730
70 300 0.10 50.0% 100.0% 10 0.949800 0.941164 0.939527 0.943497 0.005520
71 300 0.10 80.0% 100.0% 10 0.951010 0.942212 0.940624 0.944615 0.005595
In [55]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_xgb['Average AUC'], results_df_xgb['Std AUC'])
xlab = 'Average AUC Scores'
ylab = 'Standard Deviation of AUC Scores'
title = 'XGBoost Model Average AUC vs Std AUC'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
No description has been provided for this image
In [56]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_xgb['AUC Train'], results_df_xgb['AUC Test 2'])
xlab = 'AUC of Train sample'
ylab = 'AUC of Test 2'
title = 'Train AUC vs Test_2 AUC'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
No description has been provided for this image

Best XGB Model Based on Bias and Variance¶

In [57]:
# Calculate the absolute difference between 'AUC Train' and 'AUC Test 1'
results_df_xgb['AUC Diff'] = abs(results_df_xgb['AUC Train'] - results_df_xgb['AUC Test 1'])

# Find the minimum difference to identify the models with the closest train and test performance
min_diff = results_df_xgb['AUC Diff'].min()

# Filter the DataFrame to rows that match the minimum difference
min_diff_rows = results_df_xgb[results_df_xgb['AUC Diff'] == min_diff]

# From those rows, find the one with the highest 'AUC Test 2'
highest_test_auc_index = min_diff_rows['AUC Test 2'].idxmax()

# Select the best row based on the criteria
best_model_xgb = results_df_xgb.loc[highest_test_auc_index]

print("Row with the highest 'AUC Test 2' and smallest difference between 'AUC Train' and 'AUC Test 2':")
print(best_model_xgb)
Row with the highest 'AUC Test 2' and smallest difference between 'AUC Train' and 'AUC Test 2':
n_estimators                 50
learning_rate              0.01
Subsample %               50.0%
Features                  50.0%
% Weight of Default           5
AUC Train              0.926768
AUC Test 1             0.926775
AUC Test 2             0.925061
Average AUC            0.926201
Std AUC                0.000988
AUC Diff               0.000007
Name: 2, dtype: object
In [58]:
best_xgb_model_params = {
    'n_estimators': best_model_xgb['n_estimators'],
    'learning_rate': best_model_xgb['learning_rate'],
    'subsample': float(best_model_xgb['Subsample %'][:-1]) / 100.0,
    'colsample_bytree': float(best_model_xgb['Features'][:-1]) / 100.0,
    'scale_pos_weight': best_model_xgb['% Weight of Default']
}
In [59]:
xgb_final = XGBClassifier(**best_xgb_model_params)
xgb_final.fit(X_train, y_train)
Out[59]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In [60]:
print(xgb_final.get_params())
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.01, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 50, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 5, 'subsample': 0.5, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
In [61]:
# Get feature importances
feature_importances_3 = xgb_final.feature_importances_

# Print feature importance for features with importance > 0.5%
print("Features with importance higher than 0.5%:")
for feature_name, importance in zip(X_train.columns, feature_importances_3):
    if importance > 0.005:
        print(f'Feature: {feature_name}, Importance: {importance}')
Features with importance higher than 0.5%:
Feature: D_52, Importance: 0.01048071775585413
Feature: B_2, Importance: 0.013965119607746601
Feature: S_7, Importance: 0.00853777676820755
Feature: B_7, Importance: 0.16809943318367004
Feature: P_2, Importance: 0.19531044363975525
Feature: B_3, Importance: 0.02020329236984253
Feature: D_45, Importance: 0.019460203126072884
Feature: D_44, Importance: 0.03430410474538803
Feature: B_1, Importance: 0.05245744436979294
Feature: B_37, Importance: 0.05257605016231537
Feature: P_Total, Importance: 0.011594913899898529
Feature: B_Total, Importance: 0.006424103397876024
Feature: D_75, Importance: 0.03697102516889572
Feature: R_27, Importance: 0.005594146903604269
Feature: B_9, Importance: 0.03343643620610237
Feature: D_48, Importance: 0.21153990924358368
Feature: B_10, Importance: 0.03539016842842102
Feature: B_Ave, Importance: 0.008054395206272602
Feature: S_3, Importance: 0.007965869270265102
Feature: D_42, Importance: 0.016912920400500298
In [62]:
# Plotting Top 10 features
from xgboost import plot_importance

plt.figure(figsize=(15, 20))
plot_importance(xgb_final, max_num_features=20)
plt.title('Top 20 Feature Importance')
plt.show()
<Figure size 1500x2000 with 0 Axes>
No description has been provided for this image

Model Performance Functions¶

In [63]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

def model_performance(model_name,
                      X_train_data,
                      y_train_data, 
                      X_test_data1,
                      y_test_data1,
                      X_test_data2,
                      y_test_data2): 

    train_preds = model_name.predict(X_train_data)
    test_preds1= model_name.predict(X_test_data1)
    test_preds2 = model_name.predict(X_test_data2)

    
    a = classification_report(y_train_data, train_preds)
    b = classification_report(y_test_data1, test_preds1)
    e = classification_report(y_test_data2, test_preds2)
    
    c = round(model_name.score(X_train_data, y_train_data),4)
    d = round(model_name.score(X_test_data1, y_test_data1),4)
    f = round(model_name.score(X_test_data2, y_test_data2),4)
    
    print('classification report for training data')
    print(a)
    print('\n')
    print('classification report for testing data 1')
    print(b)
    print('\n')
    print('classification report for testing data 2')
    print(e)
    print('\n')
    print('the model score for training data is ',c)
    print('the model score for testing data 1 is ',d)
    print('the model score for testing data 2 is ',f)
    print('\n')
    
    
    from seaborn import heatmap
    fig, axes = plt.subplots(ncols=3)
    fig.set_size_inches(20,5)

    plt.subplot(1,3,1)
    sns.heatmap(confusion_matrix(y_train_data, train_preds), annot=True, cbar=False, fmt='d')
    plt.xlabel('predicted labels')
    plt.ylabel('actual labels')
    plt.title('Confusion Matrix for Training Data')
    
    
    plt.subplot(1,3,2) 
    sns.heatmap(confusion_matrix(y_test_data1, test_preds1), annot=True, cbar=False, fmt='d')
    plt.xlabel('predicted labels')
    plt.ylabel('actual labels')
    plt.title('Confusion Matrix for Testing Data 1')


    plt.subplot(1,3,3) 
    sns.heatmap(confusion_matrix(y_test_data2, test_preds2), annot=True, cbar=False, fmt='d')
    plt.xlabel('predicted labels')
    plt.ylabel('actual labels')
    plt.title('Confusion Matrix for Testing Data 2')
In [64]:
# ROC AUC Plots
def roc_score_auc_curve(model_name,
                      X_train_data,
                      y_train_data, 
                      X_test_data1,
                      y_test_data1,
                      X_test_data2,
                      y_test_data2):
    
    train_preds = model_name.predict(X_train_data)
    test_preds1= model_name.predict(X_test_data1)

    
    a = round(roc_auc_score(y_train_data, model_name.predict_proba(X_train_data)[:,1]),4)
    b = round(roc_auc_score(y_test_data1, model_name.predict_proba(X_test_data1)[:,1]),4)
    c = round(roc_auc_score(y_test_data2, model_name.predict_proba(X_test_data2)[:,1]),4)
    
    print('AUC Score for Model on Training Data is',a)
    print('AUC Score for Model on Testing Data 1 is',b)
    print('AUC Score for Model on Testing Data 2 is',c)
    
    
    plt.figure(figsize=(12,7))
    
    train_fpr, train_tpr, train_thresholds = roc_curve(y_train_data, model_name.predict_proba(X_train_data)[:,1])
    plt.plot([0,1],[0,1], linestyle='--', color='g')
    plt.plot(train_fpr, train_tpr, marker='.') 
  

    test_fpr, test_tpr, test_thresholds = roc_curve(y_test_data1, model_name.predict_proba(X_test_data1)[:,1])
    plt.plot([0,1],[0,1], linestyle='--', color='g')
    plt.plot(test_fpr, test_tpr, marker='o') 


    test_fpr2, test_tpr2, test_thresholds2 = roc_curve(y_test_data2, model_name.predict_proba(X_test_data2)[:,1])
    plt.plot([0,1],[0,1], linestyle='--', color='g')
    plt.plot(test_fpr2, test_tpr2, marker='o')
    

Final XGBoost Model's Performance¶

In [65]:
model_performance(model_name=xgb_final,
                      X_train_data=X_train,
                      y_train_data=y_train, 
                      X_test_data1=X_test1,
                      y_test_data1=y_test1,
                      X_test_data2=X_test2,
                      y_test_data2=y_test2)
classification report for training data
              precision    recall  f1-score   support

           0       0.99      0.67      0.80    403583
           1       0.49      0.97      0.66    133850

    accuracy                           0.75    537433
   macro avg       0.74      0.82      0.73    537433
weighted avg       0.86      0.75      0.76    537433



classification report for testing data 1
              precision    recall  f1-score   support

           0       0.99      0.67      0.80     86376
           1       0.49      0.97      0.66     28788

    accuracy                           0.75    115164
   macro avg       0.74      0.82      0.73    115164
weighted avg       0.86      0.75      0.76    115164



classification report for testing data 2
              precision    recall  f1-score   support

           0       0.99      0.67      0.80     86273
           1       0.50      0.97      0.66     28892

    accuracy                           0.74    115165
   macro avg       0.74      0.82      0.73    115165
weighted avg       0.86      0.74      0.76    115165



the model score for training data is  0.7454
the model score for testing data 1 is  0.7451
the model score for testing data 2 is  0.7443


No description has been provided for this image
In [66]:
roc_score_auc_curve(model_name=xgb_final,
                      X_train_data=X_train,
                      y_train_data=y_train, 
                      X_test_data1=X_test1,
                      y_test_data1=y_test1,
                      X_test_data2=X_test2,
                      y_test_data2=y_test2)
AUC Score for Model on Training Data is 0.9268
AUC Score for Model on Testing Data 1 is 0.9267
AUC Score for Model on Testing Data 2 is 0.9252
No description has been provided for this image
In [67]:
def k_fold_cross_valscore(model_name,
                          x_train_data,
                          y_train_data,
                          folds
                         ):
    from sklearn.model_selection import cross_val_score
    import pandas as pd
    
    model_kfold_recall = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='recall', verbose=0)
    model_kfold_accuracy = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='accuracy', verbose=0)
    model_kfold_precision = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='precision', verbose=0)
    model_kfold_f1 = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='f1', verbose=0)
    
    a = pd.DataFrame(model_kfold_recall)
    b = pd.DataFrame(model_kfold_accuracy)
    c = pd.DataFrame(model_kfold_precision)
    d = pd.DataFrame(model_kfold_f1)

    cross_val_data = pd.concat([a,b,c,d], ignore_index=True, axis=1)
    cross_val_data.rename(columns={0:"Recall", 1:"Accuracy", 2:"Precision",3:"F1 Score"},
                        inplace=True)
       
   
    print('\n')
    print("The mean recall for the model after " ,folds," folds is ", np.mean(model_kfold_recall))
    print("The mean accuracy for model after ",folds," folds is", np.mean(model_kfold_accuracy))
    print("the mean precision for the model after ",folds," folds is",np.mean(model_kfold_precision))
    print("the mean f1 score for the model after ",folds," folds is", np.mean(model_kfold_f1))
    print("\n")
    return cross_val_data
In [68]:
# K- Fold Cross Validation for Training Data
k_fold_cross_valscore(model_name =xgb_final,
                      x_train_data = X_train,
                      y_train_data = y_train,
                      folds = 10)

The mean recall for the model after  10  folds is  0.9730070974971984
The mean accuracy for model after  10  folds is 0.7452761576856105
the mean precision for the model after  10  folds is 0.49422347482501905
the mean f1 score for the model after  10  folds is 0.6554961255020741


Out[68]:
Recall Accuracy Precision F1 Score
0 0.973104 0.743785 0.492718 0.654194
1 0.972507 0.745516 0.494454 0.655587
2 0.974673 0.745274 0.494223 0.655875
3 0.970564 0.744860 0.493785 0.654557
4 0.972731 0.743818 0.492753 0.654140
5 0.973552 0.745344 0.494291 0.655681
6 0.971236 0.744748 0.493677 0.654615
7 0.974001 0.745474 0.494425 0.655900
8 0.974972 0.750014 0.499044 0.660175
9 0.972731 0.743929 0.492864 0.654238
In [69]:
# K- Fold Cross Validation for Testing Data 1
k_fold_cross_valscore(model_name =xgb_final,
                      x_train_data = X_test1,
                      y_train_data = y_test1,
                      folds = 10)

The mean recall for the model after  10  folds is  0.973877888358367
The mean accuracy for model after  10  folds is 0.7497133795872017
the mean precision for the model after  10  folds is 0.49971810702330277
the mean f1 score for the model after  10  folds is 0.6604991904791444


Out[69]:
Recall Accuracy Precision F1 Score
0 0.971171 0.756187 0.506430 0.665714
1 0.974644 0.753234 0.503318 0.663828
2 0.977075 0.750369 0.500356 0.661804
3 0.979507 0.747417 0.497354 0.659726
4 0.967338 0.752345 0.502346 0.661283
5 0.977067 0.754342 0.504395 0.665326
6 0.974297 0.750782 0.500803 0.661557
7 0.967697 0.742185 0.492052 0.652383
8 0.974297 0.745832 0.495758 0.657140
9 0.975686 0.744443 0.494368 0.656232
In [70]:
# K- Fold Cross Validation for Testing Data 2
k_fold_cross_valscore(model_name =xgb_final,
                      x_train_data = X_test2,
                      y_train_data = y_test2,
                      folds = 10)

The mean recall for the model after  10  folds is  0.9737643441714846
The mean accuracy for model after  10  folds is 0.7449224839201262
the mean precision for the model after  10  folds is 0.4957565763441945
the mean f1 score for the model after  10  folds is 0.6570094538950928


Out[70]:
Recall Accuracy Precision F1 Score
0 0.976116 0.745854 0.496654 0.658340
1 0.973001 0.748025 0.498846 0.659550
2 0.975078 0.746375 0.497176 0.658562
3 0.972318 0.740644 0.491517 0.652957
4 0.975779 0.745854 0.496741 0.658340
5 0.968155 0.746092 0.496891 0.656727
6 0.975770 0.738625 0.489495 0.651943
7 0.973001 0.749218 0.500089 0.660635
8 0.976116 0.743314 0.494130 0.656119
9 0.972309 0.745224 0.496027 0.656922

Shap Analysis for XGBoost Model¶

In [110]:
import shap

# bee swarm plot
explainer = shap.Explainer(xgb_final)
shap_values = explainer(X_test2)

shap.plots.beeswarm(shap_values)
No description has been provided for this image
In [111]:
# waterfall plot
shap.plots.waterfall(shap_values[150])
No description has been provided for this image

Score Bins for Best XGBoost Model¶

In [73]:
# 1. Show the parameters of the final model
print("Parameters of the final model:", xgb_final.get_params())
print('\n')

# 2. Calculate and display AUC on each sample
y_train_pred = xgb_final.predict_proba(X_train)[:, 1]
y_test1_pred = xgb_final.predict_proba(X_test1)[:, 1]
y_test2_pred = xgb_final.predict_proba(X_test2)[:, 1]

auc_train = roc_auc_score(y_train, y_train_pred)
auc_test1 = roc_auc_score(y_test1, y_test1_pred)
auc_test2 = roc_auc_score(y_test2, y_test2_pred)

print(f"AUC on Train: {auc_train}")
print(f"AUC on Test 1: {auc_test1}")
print(f"AUC on Test 2: {auc_test2}")

# 3. Define score bins based on the train sample
train_scores = xgb_final.predict_proba(X_train)[:, 1]
bins = np.percentile(train_scores, [0, 25, 50, 75, 100])

# 4. Apply the same thresholds to test samples and calculate default rates
def calculate_default_rate(y_true, y_pred, bins):
    indices = np.digitize(y_pred, bins) - 1  # Find bin index for each prediction
    default_rate = [np.mean(y_true[indices == i]) for i in range(len(bins)-1)]
    return default_rate

default_rates_train = calculate_default_rate(y_train, y_train_pred, bins)
default_rates_test1 = calculate_default_rate(y_test1, y_test1_pred, bins)
default_rates_test2 = calculate_default_rate(y_test2, y_test2_pred, bins)


color_train = 'red'  # Red color
color_test1 = '#00CED1'  # Turquoise color (HEX code)
color_test2 = 'purple'  # Purple color


# 5. Show rank orderings in a Bar-Chart
width = 0.25  # Width of the bars
x_indexes = np.arange(len(bins)-1)

plt.figure(figsize=(12, 6))
plt.bar(x_indexes - width, default_rates_train, width=width, label='Train', color=color_train)
plt.bar(x_indexes, default_rates_test1, width=width, label='Test 1', color=color_test1)
plt.bar(x_indexes + width, default_rates_test2, width=width, label='Test 2', color=color_test2)

# Formatting the plot
plt.xlabel('Score Bins')
plt.ylabel('Default Rate')
plt.title('Rank Orderings by Score Bins')
plt.xticks(x_indexes, [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins)-1)])
plt.legend()
plt.grid(True)
plt.show()
Parameters of the final model: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.01, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 50, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 5, 'subsample': 0.5, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


AUC on Train: 0.9267799025399948
AUC on Test 1: 0.9267157509578828
AUC on Test 2: 0.9252101702245537
No description has been provided for this image
In [ ]:
 
In [ ]:
 

Outlier Analysis¶

In [74]:
df4 = df3.copy()
In [ ]:
 
In [75]:
print(features)
['D_52', 'B_Ave_3_months', 'D_112', 'B_38_2.0', 'B_2', 'R_Ave', 'R_Ave_12_months', 'S_7', 'R_1', 'B_38_4.0', 'R_26', 'B_7', 'D_41', 'P_2', 'D_129', 'D_132', 'B_3', 'D_49', 'D_45', 'D_44', 'B_1', 'D_46', 'B_37', 'P_Total', 'B_Total', 'D_51', 'S_Ave_6_months', 'D_75', 'R_27', 'B_9', 'B_Ave_6_months', 'D_79', 'D_48', 'S_23', 'D_43', 'B_4', 'B_10', 'B_Ave', 'R_Total', 'S_3', 'D_42']
In [77]:
num_plots = len(features)

num_rows = (num_plots + 1) // 2 # to make sure we have enough rows for odd number of variables
num_cols = 2

# create subplots with the above calculated num_rows and num_cols
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12,80))

axes = axes.flatten()

# iterate over the columns and create boxplots
for i, column in enumerate(features):
    sns.boxplot(x = column, data=df4, ax=axes[i]) #orient='v')
    #sns.despine(offset = 10, trim=True, ax=axes[i])

# hide any empty subplots
# for j in range(len(num), len(axes)):
#     fig.delaxes(axes[j])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [78]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Cap and floor observations at 1st and 99th percentiles (Train sample)
lower_percentile = df4.drop(["customer_ID", "S_2", "target"], axis=1).quantile(0.01)
upper_percentile = df4.drop(["customer_ID", "S_2", "target"], axis=1).quantile(0.99)
df4_out_removed = df4.drop(["customer_ID", "S_2", "target"], axis=1).clip(lower=lower_percentile,
                                                                     upper=upper_percentile,
                                                                     axis=1)

# Step 3: Replace missing values with 0
df4_out_removed = df4_out_removed.fillna(0)

# Step 2: Standardize (normalize) the data based on Mean and Standard Deviation from Train sample
scaler = StandardScaler()

# Fit the scaler on the Train sample data
scaler.fit(df4_out_removed)

# Transform the Train, Test1, and Test2 samples
X = df4_out_removed  # Use the capped and standardized data
y = df4['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Apply the same scaling to all data subsets
X_train = scaler.transform(X_train)
X_test1 = scaler.transform(X_test1)
X_test2 = scaler.transform(X_test2)
In [ ]:
 

Neural Network¶

In [79]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
2024-04-06 13:42:32.857711: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-06 13:42:32.860904: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-06 13:42:32.894045: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-06 13:42:33.721393: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [80]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
from sklearn.metrics import roc_auc_score

# Assuming your datasets are defined: X_train, y_train, X_test1, y_test1, X_test2, y_test2

def build_model(hidden_layers, nodes, activation, dropout, input_shape):
    model = Sequential()
    model.add(tf.keras.Input(shape=(input_shape,)))  # Adjust based on your dataset
    for _ in range(hidden_layers):
        model.add(Dense(nodes, activation=activation))
        if dropout < 1.0:
            model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification
    return model

results_list = []  # List to hold all results before creating DataFrame

parameter_grid = [
    (hl, n, af, d, bs)
    for hl in [2, 4]
    for n in [4, 6]
    for af in ['relu', 'tanh']
    for d in [0.5, 0.0]  # 0.0 represents no dropout
    for bs in [100, 10000]
]

input_shape = X_train.shape[1]  # Ensure this matches your dataset

for hl, n, af, d, bs in parameter_grid:
    model = build_model(hl, n, af, d, input_shape)
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
    
    # Train the model
    model.fit(X_train, y_train, epochs=20, batch_size=bs, verbose=0)
    
    # Evaluate model performance using roc_auc_score
    auc_train = roc_auc_score(y_train, model.predict(X_train).ravel())
    auc_test_1 = roc_auc_score(y_test1, model.predict(X_test1).ravel())
    auc_test_2 = roc_auc_score(y_test2, model.predict(X_test2).ravel())
    
    # Collect results
    results_list.append({
        '# HL': hl,
        '# Node': n,
        'Activation Function': af,
        'Dropout': d,
        'Batch Size': bs,
        'AUC Train': auc_train,
        'AUC Test 1': auc_test_1,
        'AUC Test 2': auc_test_2
    })

# Convert the list of dictionaries to a DataFrame
results_df_nn = pd.DataFrame(results_list)

# Save the DataFrame to a CSV file
results_df_nn.to_csv('grid_search_results_nn.csv', index=False)

print("Grid search completed and results saved.")
2024-04-06 13:42:36.684163: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-06 13:42:36.685587: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 582us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 559us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 570us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 553us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 590us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 557us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 514us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 533us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 571us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 8s 499us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 564us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 533us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 551us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 518us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 550us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 592us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 561us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 547us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 571us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 557us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 560us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 539us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 560us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 567us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 559us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 604us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 555us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 580us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 598us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 617us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 608us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 576us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 566us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 584us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 550us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 584us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 615us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 582us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 613us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 591us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 611us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 644us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 597us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 607us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 669us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 624us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 575us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 618us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 622us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 602us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 607us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 648us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 585us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 612us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 659us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 604us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 635us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 606us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 664us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 654us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 597us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 596us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 576us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 588us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 583us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 597us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 631us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 641us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 600us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 614us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 610us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 641us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 619us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 610us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 615us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 640us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 615us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 615us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 613us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 585us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 612us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 615us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 622us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 648us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 621us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 679us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 594us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 635us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 637us/step
Grid search completed and results saved.
In [81]:
results_df_nn
Out[81]:
# HL # Node Activation Function Dropout Batch Size AUC Train AUC Test 1 AUC Test 2
0 2 4 relu 0.5 100 0.909950 0.910248 0.909084
1 2 4 relu 0.5 10000 0.918418 0.919725 0.917964
2 2 4 relu 0.0 100 0.933205 0.934030 0.932357
3 2 4 relu 0.0 10000 0.927684 0.928676 0.927315
4 2 4 tanh 0.5 100 0.930401 0.931599 0.929836
5 2 4 tanh 0.5 10000 0.930554 0.931703 0.930125
6 2 4 tanh 0.0 100 0.933622 0.934339 0.932977
7 2 4 tanh 0.0 10000 0.930719 0.931448 0.929854
8 2 6 relu 0.5 100 0.931800 0.933030 0.931301
9 2 6 relu 0.5 10000 0.929381 0.930490 0.929016
10 2 6 relu 0.0 100 0.934423 0.935323 0.933645
11 2 6 relu 0.0 10000 0.931264 0.932065 0.930585
12 2 6 tanh 0.5 100 0.929642 0.930880 0.929157
13 2 6 tanh 0.5 10000 0.930797 0.932000 0.930362
14 2 6 tanh 0.0 100 0.934677 0.935297 0.933549
15 2 6 tanh 0.0 10000 0.930554 0.931508 0.929754
16 4 4 relu 0.5 100 0.929322 0.930723 0.928703
17 4 4 relu 0.5 10000 0.891645 0.892041 0.891669
18 4 4 relu 0.0 100 0.933497 0.934424 0.932836
19 4 4 relu 0.0 10000 0.931591 0.932588 0.930949
20 4 4 tanh 0.5 100 0.929506 0.930680 0.929208
21 4 4 tanh 0.5 10000 0.929591 0.930628 0.929005
22 4 4 tanh 0.0 100 0.933546 0.934173 0.932892
23 4 4 tanh 0.0 10000 0.930216 0.931005 0.929529
24 4 6 relu 0.5 100 0.930308 0.931446 0.929894
25 4 6 relu 0.5 10000 0.910467 0.910680 0.909214
26 4 6 relu 0.0 100 0.934501 0.935337 0.933631
27 4 6 relu 0.0 10000 0.927674 0.928096 0.926558
28 4 6 tanh 0.5 100 0.929873 0.930968 0.929383
29 4 6 tanh 0.5 10000 0.930363 0.931508 0.929952
30 4 6 tanh 0.0 100 0.934569 0.935102 0.933525
31 4 6 tanh 0.0 10000 0.931858 0.932815 0.931126
In [82]:
# calculate the mean auc
results_df_nn['Average AUC'] = results_df_nn[['AUC Train', 'AUC Test 1', 'AUC Test 2']].mean(axis=1)
# calculate the std for auc
results_df_nn['Std AUC'] = results_df_nn[['AUC Train', 'AUC Test 1', 'AUC Test 2']].std(axis=1)

results_df_nn
# results_df_xgb.iloc[:, 5:]
Out[82]:
# HL # Node Activation Function Dropout Batch Size AUC Train AUC Test 1 AUC Test 2 Average AUC Std AUC
0 2 4 relu 0.5 100 0.909950 0.910248 0.909084 0.909760 0.000605
1 2 4 relu 0.5 10000 0.918418 0.919725 0.917964 0.918703 0.000914
2 2 4 relu 0.0 100 0.933205 0.934030 0.932357 0.933197 0.000837
3 2 4 relu 0.0 10000 0.927684 0.928676 0.927315 0.927892 0.000704
4 2 4 tanh 0.5 100 0.930401 0.931599 0.929836 0.930612 0.000900
5 2 4 tanh 0.5 10000 0.930554 0.931703 0.930125 0.930794 0.000816
6 2 4 tanh 0.0 100 0.933622 0.934339 0.932977 0.933646 0.000682
7 2 4 tanh 0.0 10000 0.930719 0.931448 0.929854 0.930674 0.000798
8 2 6 relu 0.5 100 0.931800 0.933030 0.931301 0.932044 0.000890
9 2 6 relu 0.5 10000 0.929381 0.930490 0.929016 0.929629 0.000767
10 2 6 relu 0.0 100 0.934423 0.935323 0.933645 0.934464 0.000839
11 2 6 relu 0.0 10000 0.931264 0.932065 0.930585 0.931305 0.000740
12 2 6 tanh 0.5 100 0.929642 0.930880 0.929157 0.929893 0.000888
13 2 6 tanh 0.5 10000 0.930797 0.932000 0.930362 0.931053 0.000848
14 2 6 tanh 0.0 100 0.934677 0.935297 0.933549 0.934508 0.000886
15 2 6 tanh 0.0 10000 0.930554 0.931508 0.929754 0.930605 0.000879
16 4 4 relu 0.5 100 0.929322 0.930723 0.928703 0.929583 0.001035
17 4 4 relu 0.5 10000 0.891645 0.892041 0.891669 0.891785 0.000222
18 4 4 relu 0.0 100 0.933497 0.934424 0.932836 0.933585 0.000798
19 4 4 relu 0.0 10000 0.931591 0.932588 0.930949 0.931710 0.000826
20 4 4 tanh 0.5 100 0.929506 0.930680 0.929208 0.929798 0.000778
21 4 4 tanh 0.5 10000 0.929591 0.930628 0.929005 0.929741 0.000822
22 4 4 tanh 0.0 100 0.933546 0.934173 0.932892 0.933537 0.000640
23 4 4 tanh 0.0 10000 0.930216 0.931005 0.929529 0.930250 0.000739
24 4 6 relu 0.5 100 0.930308 0.931446 0.929894 0.930549 0.000804
25 4 6 relu 0.5 10000 0.910467 0.910680 0.909214 0.910120 0.000793
26 4 6 relu 0.0 100 0.934501 0.935337 0.933631 0.934490 0.000853
27 4 6 relu 0.0 10000 0.927674 0.928096 0.926558 0.927443 0.000794
28 4 6 tanh 0.5 100 0.929873 0.930968 0.929383 0.930075 0.000811
29 4 6 tanh 0.5 10000 0.930363 0.931508 0.929952 0.930608 0.000806
30 4 6 tanh 0.0 100 0.934569 0.935102 0.933525 0.934399 0.000802
31 4 6 tanh 0.0 10000 0.931858 0.932815 0.931126 0.931933 0.000847
In [83]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_nn['Average AUC'], results_df_nn['Std AUC'])
xlab = 'Average AUC Scores for Neural Net Model'
ylab = 'Standard Deviation of AUC Scores'
title = 'XGBoost Model Average AUC vs Std'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
No description has been provided for this image
In [84]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_nn['AUC Train'], results_df_nn['AUC Test 2'])
xlab = 'AUC of Train sample for Neural Net Model'
ylab = 'AUC of Test 2'
title = 'Train AUC vs Test_2 AUC'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
No description has been provided for this image

Best Neural Network based on Bias-Variance¶

In [85]:
# Analyze the results to get the optimal parameters
# Load the results
# results_df_nn = pd.read_csv('grid_search_results_nn.csv')
results_df_nn
# Find the row with the highest test AUC and smallest difference between train and test AUC
results_df_nn['AUC Diff'] = abs(results_df_nn['AUC Train'] - results_df_nn[['AUC Test 1', 'AUC Test 2']].mean(axis=1))
best_model_nn = results_df_nn.loc[(results_df_nn[['AUC Test 1', 'AUC Test 2']].mean(axis=1) - results_df_nn['AUC Diff']).idxmax()]

print("Optimal Parameters:")
print(best_model_nn)
Optimal Parameters:
# HL                          4
# Node                        6
Activation Function        relu
Dropout                     0.0
Batch Size                  100
AUC Train              0.934501
AUC Test 1             0.935337
AUC Test 2             0.933631
Average AUC             0.93449
Std AUC                0.000853
AUC Diff               0.000016
Name: 26, dtype: object
In [86]:
# Extracting parameters from best_model_nn
hl = int(best_model_nn['# HL'])  # Number of hidden layers
n = int(best_model_nn['# Node'])  # Nodes per layer
af = best_model_nn['Activation Function']  # Activation function
d = float(best_model_nn['Dropout'])  # Dropout rate
bs = int(best_model_nn['Batch Size'])  # Batch size

# Define the final model (nn_final) with optimum parameters
nn_final = Sequential()
nn_final.add(tf.keras.Input(shape=(X_train.shape[1],)))  # Use the actual input shape of your dataset
for _ in range(hl):
    nn_final.add(Dense(n, activation=af))
    if d < 1.0:
        nn_final.add(Dropout(d))
nn_final.add(Dense(1, activation='sigmoid'))  # Assuming a binary classification task
nn_final.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])

# Train the model (nn_final) with optimal parameters
nn_final.fit(X_train, y_train, epochs=20, batch_size=bs, verbose=1)
Epoch 1/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - auc: 0.8941 - loss: 0.3413
Epoch 2/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9315 - loss: 0.2839
Epoch 3/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9327 - loss: 0.2804
Epoch 4/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9332 - loss: 0.2794
Epoch 5/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9330 - loss: 0.2801
Epoch 6/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9330 - loss: 0.2801
Epoch 7/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9334 - loss: 0.2791
Epoch 8/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9331 - loss: 0.2797
Epoch 9/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9333 - loss: 0.2796
Epoch 10/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9329 - loss: 0.2803
Epoch 11/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9340 - loss: 0.2780
Epoch 12/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9331 - loss: 0.2802
Epoch 13/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9340 - loss: 0.2777
Epoch 14/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9328 - loss: 0.2803
Epoch 15/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9333 - loss: 0.2791
Epoch 16/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9342 - loss: 0.2780
Epoch 17/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9334 - loss: 0.2789
Epoch 18/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9341 - loss: 0.2775
Epoch 19/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9334 - loss: 0.2788
Epoch 20/20
5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9340 - loss: 0.2781
Out[86]:
<keras.src.callbacks.history.History at 0x78e78c796970>
In [87]:
print(nn_final)
<Sequential name=sequential_32, built=True>
In [88]:
# Evaluate the model
auc_train = roc_auc_score(y_train, nn_final.predict(X_train).ravel())
auc_test_1 = roc_auc_score(y_test1, nn_final.predict(X_test1).ravel())
print(f"Retrained Model AUC: Train={auc_train}, Test1={auc_test_1}")
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 605us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 622us/step
Retrained Model AUC: Train=0.9340498172065264, Test1=0.9346269463697459
In [89]:
from sklearn.metrics import classification_report, confusion_matrix

def model_performance(model, X_train, y_train, X_test1, y_test1, X_test2, y_test2): 
    # Getting predictions
    train_preds = model.predict(X_train).ravel()
    test_preds1 = model.predict(X_test1).ravel()
    test_preds2 = model.predict(X_test2).ravel()
    
    # Converting probabilities to class labels based on 0.5 threshold
    train_preds = np.where(train_preds > 0.5, 1, 0)
    test_preds1 = np.where(test_preds1 > 0.5, 1, 0)
    test_preds2 = np.where(test_preds2 > 0.5, 1, 0)

    # Classification reports
    print('Classification report for training data')
    print(classification_report(y_train, train_preds))
    print('Classification report for testing data 1')
    print(classification_report(y_test1, test_preds1))
    print('Classification report for testing data 2')
    print(classification_report(y_test2, test_preds2))

    # Plotting confusion matrices
    fig, axes = plt.subplots(1, 3, figsize=(20, 5))
    
    sns.heatmap(confusion_matrix(y_train, train_preds), annot=True, cbar=False, fmt='d', ax=axes[0])
    axes[0].set_xlabel('Predicted labels')
    axes[0].set_ylabel('Actual labels')
    axes[0].set_title('Confusion Matrix for Training Data')
    
    sns.heatmap(confusion_matrix(y_test1, test_preds1), annot=True, cbar=False, fmt='d', ax=axes[1])
    axes[1].set_xlabel('Predicted labels')
    axes[1].set_ylabel('Actual labels')
    axes[1].set_title('Confusion Matrix for Testing Data 1')
    
    sns.heatmap(confusion_matrix(y_test2, test_preds2), annot=True, cbar=False, fmt='d', ax=axes[2])
    axes[2].set_xlabel('Predicted labels')
    axes[2].set_ylabel('Actual labels')
    axes[2].set_title('Confusion Matrix for Testing Data 2')
    
    plt.tight_layout()
    plt.show()
In [90]:
model_performance(nn_final, X_train, y_train, X_test1, y_test1, X_test2, y_test2)
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 598us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 633us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 665us/step
Classification report for training data
              precision    recall  f1-score   support

           0       0.92      0.91      0.91    403583
           1       0.74      0.76      0.75    133850

    accuracy                           0.87    537433
   macro avg       0.83      0.84      0.83    537433
weighted avg       0.87      0.87      0.87    537433

Classification report for testing data 1
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     86376
           1       0.74      0.76      0.75     28788

    accuracy                           0.87    115164
   macro avg       0.83      0.84      0.83    115164
weighted avg       0.87      0.87      0.87    115164

Classification report for testing data 2
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     86273
           1       0.73      0.76      0.75     28892

    accuracy                           0.87    115165
   macro avg       0.83      0.83      0.83    115165
weighted avg       0.87      0.87      0.87    115165

No description has been provided for this image
In [91]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

def roc_score_auc_curve_nn(model, X_train, y_train, X_test1, y_test1, X_test2, y_test2):
    # Getting the predicted probabilities
    train_preds = model.predict(X_train).ravel()
    test_preds1 = model.predict(X_test1).ravel()
    test_preds2 = model.predict(X_test2).ravel()
    
    # Calculating ROC AUC scores
    a = round(roc_auc_score(y_train, train_preds), 4)
    b = round(roc_auc_score(y_test1, test_preds1), 4)
    c = round(roc_auc_score(y_test2, test_preds2), 4)
    
    print('AUC Score for Model on Training Data is', a)
    print('AUC Score for Model on Testing Data 1 is', b)
    print('AUC Score for Model on Testing Data 2 is', c)
    
    # Plotting ROC curves
    plt.figure(figsize=(12, 7))
    
    # Training Data
    train_fpr, train_tpr, _ = roc_curve(y_train, train_preds)
    plt.plot(train_fpr, train_tpr, label=f'Train AUC: {a}', marker='.')
    
    # Test Data 1
    test_fpr, test_tpr, _ = roc_curve(y_test1, test_preds1)
    plt.plot(test_fpr, test_tpr, label=f'Test 1 AUC: {b}', marker='o')
    
    # Test Data 2
    test_fpr2, test_tpr2, _ = roc_curve(y_test2, test_preds2)
    plt.plot(test_fpr2, test_tpr2, label=f'Test 2 AUC: {c}', marker='x')
    
    # Line of No Discrimination
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='best')
    plt.show()
In [92]:
roc_score_auc_curve_nn(nn_final, X_train, y_train, X_test1, y_test1, X_test2, y_test2)
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 642us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 611us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step
AUC Score for Model on Training Data is 0.934
AUC Score for Model on Testing Data 1 is 0.9346
AUC Score for Model on Testing Data 2 is 0.9329
No description has been provided for this image

Score Bins for Neural Nets¶

In [93]:
# Calculate AUC on each sample

y_train_pred = nn_final.predict(X_train)
y_test1_pred = nn_final.predict(X_test1)
y_test2_pred = nn_final.predict(X_test2)

auc_train = roc_auc_score(y_train, y_train_pred)
auc_test1 = roc_auc_score(y_test1, y_test1_pred)
auc_test2 = roc_auc_score(y_test2, y_test2_pred)

print(f"AUC on Train: {auc_train}")
print(f"AUC on Test 1: {auc_test1}")
print(f"AUC on Test 2: {auc_test2}")

# Define score bins based on the train sample
train_scores = y_train_pred.reshape(-1)
bins = np.percentile(train_scores, [0, 25, 50, 75, 100])

# Calculate the value ranges for each bin
bin_ranges = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]

# Apply the same thresholds to test samples
test1_scores = y_test1_pred.reshape(-1)
test2_scores = y_test2_pred.reshape(-1)

# Calculate default rates in each bin for each sample
def calculate_default_rate(scores, y):
    default_rates = []
    for i in range(len(bins) - 1):
        mask = (scores >= bins[i]) & (scores <= bins[i + 1])
        if np.sum(mask) > 0:
            default_rate = np.sum(y[mask]) / np.sum(mask)
            default_rates.append(default_rate)
        else:
            default_rates.append(0.0)
    return default_rates

default_rates_train = calculate_default_rate(train_scores, y_train)
default_rates_test1 = calculate_default_rate(test1_scores, y_test1)
default_rates_test2 = calculate_default_rate(test2_scores, y_test2)

x_indexes = np.arange(len(bin_ranges))

width = 0.25
plt.figure(figsize=(12, 6))
plt.bar(x_indexes - width, default_rates_train, width=width, label='Train', align='center')
plt.bar(x_indexes, default_rates_test1, width=width, label='Test 1', align='center')
plt.bar(x_indexes + width, default_rates_test2, width=width, label='Test 2', align='center')

plt.xlabel('Score Bins')
plt.ylabel('Default Rate')
plt.title('Rank Orderings by Score Bins')
plt.xticks(x_indexes, bin_ranges, rotation=45)  # Display bin ranges on the x-axis
plt.legend()
plt.grid(True)
plt.show()
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 650us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 3s 743us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 669us/step
AUC on Train: 0.9340498172065264
AUC on Test 1: 0.9346269463697459
AUC on Test 2: 0.9329086688709763
No description has been provided for this image
In [ ]:
 

Choosing Between NN and XGB¶

In [94]:
best_model_nn
Out[94]:
# HL                          4
# Node                        6
Activation Function        relu
Dropout                     0.0
Batch Size                  100
AUC Train              0.934501
AUC Test 1             0.935337
AUC Test 2             0.933631
Average AUC             0.93449
Std AUC                0.000853
AUC Diff               0.000016
Name: 26, dtype: object
In [96]:
best_model_xgb
Out[96]:
n_estimators                 50
learning_rate              0.01
Subsample %               50.0%
Features                  50.0%
% Weight of Default           5
AUC Train              0.926768
AUC Test 1             0.926775
AUC Test 2             0.925061
Average AUC            0.926201
Std AUC                0.000988
AUC Diff               0.000007
Name: 2, dtype: object
In [97]:
# Calculate average AUC scores directly within a dictionary for easier comparison
avg_aucs = {
    "XGB Model": (best_model_xgb['AUC Test 1'] + best_model_xgb['AUC Test 2']) / 2,
    "NN Model": (best_model_nn['AUC Test 1'] + best_model_nn['AUC Test 2']) / 2
}

# Print the average AUC values for both models
print(f"Avg AUC value of XGB Model: {avg_aucs['XGB Model']:.5f}")
print(f"Avg AUC value of NN Model: {avg_aucs['NN Model']:.5f}")

# Identify the best model based on the highest average AUC score
best_model, best_avg_auc = max(avg_aucs.items(), key=lambda item: item[1])

# Print the result stating the best model and its average AUC value
print(f"The best model is {best_model} with Avg AUC value of: {best_avg_auc:.5f}")
Avg AUC value of XGB Model: 0.92592
Avg AUC value of NN Model: 0.93448
The best model is NN Model with Avg AUC value of: 0.93448
In [ ]:
 

Exporting the Neural Network Model and XGBoost Model¶

In [98]:
xgb_final.save_model('best_xgb_model.json')
In [99]:
best_nn_model = nn_final
best_nn_model

best_nn_model.save('best_nn_model.keras')
In [ ]:
 
In [ ]:
 

Strategy¶

In [100]:
X = df3.drop(['customer_ID','S_2','target'], axis=1)
y = df3['target']

X_train, X_test1, y_train, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)
In [102]:
# Get predictions for the train dataset 
nn_predictions_train = (nn_final.predict(X_train) > 0.5).astype("int32")

# Get predictions for the test1 dataset
nn_predictions_test1 = (nn_final.predict(X_test1) > 0.5).astype("int32")

# Get predictions for the test2 dataset
nn_predictions_test2 = (nn_final.predict(X_test2) > 0.5).astype("int32")

# Get predicted probabilities for the train dataset
nn_probs_train = nn_final.predict(X_train)

# Get predicted probabilities for the test1 dataset
nn_probs_test1 = nn_final.predict(X_test1)

# Get predicted probabilities for the test2 dataset
nn_probs_test2 = nn_final.predict(X_test2)
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 12s 704us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 681us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 3s 720us/step
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 615us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 671us/step
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 614us/step

Training Data¶

In [108]:
# Define the thresholds
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Calculate the average spend and balance for the last 6 months (November 2017 to April 2018)
start_date = pd.to_datetime("2017-11-01")
end_date = pd.to_datetime("2018-04-30")
date_feature_train = pd.to_datetime(df3['S_2'])  # Convert to datetime if not already
filtered_data_train = df3[(date_feature_train >= start_date) & (date_feature_train <= end_date)]
average_spend_train = filtered_data_train['S_3'].mean()
average_balance_train = filtered_data_train['B_10'].mean()

# Calculate the monthly revenue for 1 customer
monthly_revenue_train = average_balance_train * 0.02 + average_spend_train * 0.001

# Calculate the expected annual revenue over the next 12 months
expected_revenue_train = monthly_revenue_train * 12

# Assuming your neural network model is fitted and ready to make predictions
# Get predicted probabilities for the train dataset using your TensorFlow model
# Flatten the output if it's not already flat
nn_probs_train_flat = nn_final.predict(X_train).flatten()

# Initialize an empty list to store dictionaries of the results
results = []

for threshold in thresholds:
    # Determine accepted applicants based on the threshold
    accepted_indices_train = nn_probs_train_flat < threshold
    total_applicants_train = np.sum(accepted_indices_train)
    defaulted_applicants_train = np.sum(y_train[accepted_indices_train])

    # Avoid division by zero
    default_rate_train = defaulted_applicants_train / total_applicants_train if total_applicants_train > 0 else 0
    
    # Calculate the portfolio revenue
    portfolio_revenue_value_train = expected_revenue_train * total_applicants_train

    # Append a dictionary of results for this threshold
    results.append({
        'Threshold': threshold,
        'Default Rate': default_rate_train,
        'Portfolio Revenue': portfolio_revenue_value_train,
        'Applicant Count': total_applicants_train
    })

# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(results)

# Display the DataFrame
results_df
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 655us/step
Out[108]:
Threshold Default Rate Portfolio Revenue Applicant Count
0 0.1 0.216867 9.538479 166
1 0.2 0.110650 41.544097 723
2 0.3 0.010962 1493.863657 25998
3 0.4 0.038721 4359.889176 75876
4 0.5 0.112524 5792.844475 100814
5 0.6 0.201083 6897.009566 120030
6 0.7 0.235884 7283.547793 126757
7 0.8 0.245422 7382.610064 128481
8 0.9 0.249770 7428.463715 129279
9 1.0 0.256160 7502.243272 130563
In [113]:
import plotly.graph_objects as go

def plot_default_rate_and_revenue_vs_threshold(df):
    fig = go.Figure()

    # Default Rate
    fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Default Rate'],
                        mode='lines+markers',
                        name='Default Rate'))

    # Portfolio Revenue
    fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Portfolio Revenue'],
                        mode='lines+markers',
                        name='Portfolio Revenue',
                        yaxis='y2'))

    # Create axis objects
    fig.update_layout(
        title='Default Rate and Portfolio Revenue vs. Threshold',
        xaxis=dict(title='Threshold'),
        yaxis=dict(title='Default Rate'),
        yaxis2=dict(title='Portfolio Revenue', overlaying='y', side='right'),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
        height=600
    )

    fig.show()

plot_default_rate_and_revenue_vs_threshold(results_df)
In [118]:
from plotly.subplots import make_subplots

def plot_combined_metrics(df):
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add traces
    fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Default Rate'], name="Default Rate"), secondary_y=False)
    fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Portfolio Revenue'], name="Portfolio Revenue"), secondary_y=True)
    fig.add_trace(go.Bar(x=df['Threshold'], y=df['Applicant Count'], name="Applicant Count", marker_color='lightblue'), secondary_y=False)

    # Add figure title
    fig.update_layout(title_text="Combined Metrics vs. Threshold",
                     height=600)

    # Set x-axis title
    fig.update_xaxes(title_text="Threshold")

    # Set y-axes titles
    fig.update_yaxes(title_text="Default Rate / Applicant Count", secondary_y=False)
    fig.update_yaxes(title_text="Portfolio Revenue", secondary_y=True)

    fig.show()



plot_combined_metrics(results_df)
In [ ]:
 

Test 1¶

In [109]:
# Define the thresholds
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Calculate the average spend and balance for the last 6 months (November 2017 to April 2018)
start_date = pd.to_datetime("2017-11-01")
end_date = pd.to_datetime("2018-04-30")
date_feature_test1 = pd.to_datetime(df3['S_2'])  # Convert to datetime if not already
filtered_data_test1 = df3[(date_feature_test1 >= start_date) & (date_feature_test1 <= end_date)]
average_spend_test1 = filtered_data_test1['S_3'].mean()
average_balance_test1 = filtered_data_test1['B_10'].mean()

# Calculate the monthly revenue for 1 customer based on test1 data
monthly_revenue_test1 = average_balance_test1 * 0.02 + average_spend_test1 * 0.001

# Calculate the expected annual revenue over the next 12 months based on test1 data
expected_revenue_test1 = monthly_revenue_test1 * 12

# Get predicted probabilities for the test1 dataset using your TensorFlow model
nn_probs_test1_flat = nn_final.predict(X_test1).flatten()

# Initialize an empty list to store dictionaries of results for test1 data
results_test1 = []

for threshold in thresholds:
    # Determine accepted applicants based on the threshold for test1 data
    accepted_indices_test1 = nn_probs_test1_flat < threshold
    total_applicants_test1 = np.sum(accepted_indices_test1)
    defaulted_applicants_test1 = np.sum(y_test1[accepted_indices_test1])

    # Avoid division by zero
    default_rate_test1 = defaulted_applicants_test1 / total_applicants_test1 if total_applicants_test1 > 0 else 0
    
    # Calculate the portfolio revenue for test1 data
    portfolio_revenue_value_test1 = expected_revenue_test1 * total_applicants_test1

    # Append a dictionary of results for this threshold for test1 data
    results_test1.append({
        'Threshold': threshold,
        'Default Rate': default_rate_test1,
        'Portfolio Revenue': portfolio_revenue_value_test1,
        'Applicant Count': total_applicants_test1
    })

# Convert the list of dictionaries to a DataFrame for test1 data
results_df_test1 = pd.DataFrame(results_test1)

# Display the DataFrame for test1 data
results_df_test1
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 631us/step
Out[109]:
Threshold Default Rate Portfolio Revenue Applicant Count
0 0.1 0.314286 2.011125 35
1 0.2 0.132911 9.078793 158
2 0.3 0.012256 323.503823 5630
3 0.4 0.038199 935.632815 16283
4 0.5 0.110537 1246.552741 21694
5 0.6 0.198991 1481.337221 25780
6 0.7 0.235149 1565.114943 27238
7 0.8 0.244901 1586.088103 27603
8 0.9 0.249721 1596.890718 27791
9 1.0 0.256587 1613.726707 28084
In [119]:
plot_default_rate_and_revenue_vs_threshold(results_df_test1)
In [120]:
plot_combined_metrics(results_df_test1)

Test 2¶

In [110]:
# Define the thresholds
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Calculate the average spend and balance for the last 6 months (November 2017 to April 2018)
start_date = pd.to_datetime("2017-11-01")
end_date = pd.to_datetime("2018-04-30")
date_feature_test2 = pd.to_datetime(df3['S_2'])  # Convert to datetime if not already
filtered_data_test2 = df3[(date_feature_test2 >= start_date) & (date_feature_test2 <= end_date)]
average_spend_test2 = filtered_data_test2['S_3'].mean()
average_balance_test2 = filtered_data_test2['B_10'].mean()

# Calculate the monthly revenue for 1 customer based on test2 data
monthly_revenue_test2 = average_balance_test2 * 0.02 + average_spend_test2 * 0.001

# Calculate the expected annual revenue over the next 12 months based on test2 data
expected_revenue_test2 = monthly_revenue_test2 * 12

# Get predicted probabilities for the test2 dataset using your TensorFlow model
nn_probs_test2_flat = nn_final.predict(X_test2).flatten()

# Initialize an empty list to store dictionaries of results for test2 data
results_test2 = []

for threshold in thresholds:
    # Determine accepted applicants based on the threshold for test2 data
    accepted_indices_test2 = nn_probs_test2_flat < threshold
    total_applicants_test2 = np.sum(accepted_indices_test2)
    defaulted_applicants_test2 = np.sum(y_test2[accepted_indices_test2])

    # Avoid division by zero
    default_rate_test2 = defaulted_applicants_test2 / total_applicants_test2 if total_applicants_test2 > 0 else 0
    
    # Calculate the portfolio revenue for test2 data
    portfolio_revenue_value_test2 = expected_revenue_test2 * total_applicants_test2

    # Append a dictionary of results for this threshold for test2 data
    results_test2.append({
        'Threshold': threshold,
        'Default Rate': default_rate_test2,
        'Portfolio Revenue': portfolio_revenue_value_test2,
        'Applicant Count': total_applicants_test2
    })

# Convert the list of dictionaries to a DataFrame for test2 data
results_df_test2 = pd.DataFrame(results_test2)

# Display the DataFrame for test2 data
results_df_test2
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 3s 694us/step
Out[110]:
Threshold Default Rate Portfolio Revenue Applicant Count
0 0.1 0.215686 2.930496 51
1 0.2 0.141935 8.906411 155
2 0.3 0.011758 322.526991 5613
3 0.4 0.041615 929.254675 16172
4 0.5 0.117014 1239.427613 21570
5 0.6 0.206836 1480.992456 25774
6 0.7 0.241552 1562.644132 27195
7 0.8 0.251034 1583.962057 27566
8 0.9 0.255873 1594.879593 27756
9 1.0 0.261905 1609.704457 28014
In [121]:
plot_default_rate_and_revenue_vs_threshold(results_df_test2)
In [122]:
plot_combined_metrics(results_df_test2)
In [ ]: